Add prefix to QC8/QS8/QU8 WAsm SIMD GEMM/IGEMM microkernels

Specify the computation type (MUL16/MUL32) in the name of each microkernel.
This is necessary in preparation for a new set of microkernels using the
16x2->32 dot product instructions from the Chrome M87 WAsm SIMD specification.

PiperOrigin-RevId: 389550897
diff --git a/test/qc8-gemm-minmax-fp32.cc b/test/qc8-gemm-minmax-fp32.cc
index 3ae3e70..661b7fb 100644
--- a/test/qc8-gemm-minmax-fp32.cc
+++ b/test/qc8-gemm-minmax-fp32.cc
@@ -47438,7 +47438,7 @@
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -47447,10 +47447,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -47460,10 +47460,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -47473,10 +47473,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -47488,12 +47488,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -47504,11 +47504,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -47519,11 +47519,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -47533,11 +47533,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -47548,11 +47548,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47565,13 +47565,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -47581,11 +47581,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -47596,11 +47596,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47613,13 +47613,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -47629,11 +47629,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -47644,11 +47644,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47661,13 +47661,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47678,12 +47678,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47695,12 +47695,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47712,12 +47712,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -47730,13 +47730,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47747,12 +47747,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47764,12 +47764,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47781,12 +47781,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -47799,13 +47799,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47819,13 +47819,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -47835,10 +47835,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -47848,10 +47848,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -47861,13 +47861,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47876,10 +47876,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47889,10 +47889,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47902,10 +47902,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -47917,12 +47917,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47933,11 +47933,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47948,11 +47948,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47962,11 +47962,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47977,11 +47977,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47994,13 +47994,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -48010,11 +48010,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -48025,11 +48025,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -48042,13 +48042,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -48058,11 +48058,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -48073,11 +48073,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -48090,13 +48090,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48107,12 +48107,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48124,12 +48124,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48141,12 +48141,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -48159,13 +48159,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48176,12 +48176,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48193,12 +48193,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48210,12 +48210,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -48228,13 +48228,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -48248,13 +48248,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -48264,10 +48264,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -48277,10 +48277,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -48290,13 +48290,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -48305,10 +48305,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -48318,10 +48318,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -48331,10 +48331,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -48346,12 +48346,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -48362,11 +48362,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -48377,11 +48377,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -48391,11 +48391,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -48406,11 +48406,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -48423,13 +48423,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -48439,11 +48439,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -48454,11 +48454,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -48471,13 +48471,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -48487,11 +48487,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -48502,11 +48502,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -48519,13 +48519,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48536,12 +48536,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48553,12 +48553,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48570,12 +48570,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -48588,13 +48588,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48605,12 +48605,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48622,12 +48622,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48639,12 +48639,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -48657,13 +48657,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -48677,13 +48677,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -48693,10 +48693,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -48706,10 +48706,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -48719,13 +48719,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -48734,10 +48734,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -48747,10 +48747,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -48760,10 +48760,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -48775,12 +48775,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -48791,11 +48791,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -48806,11 +48806,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -48820,11 +48820,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -48835,11 +48835,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -48852,13 +48852,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -48868,11 +48868,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -48883,11 +48883,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -48900,13 +48900,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -48916,11 +48916,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -48931,11 +48931,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -48948,13 +48948,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48965,12 +48965,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48982,12 +48982,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48999,12 +48999,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -49017,13 +49017,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49034,12 +49034,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49051,12 +49051,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49068,12 +49068,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -49086,13 +49086,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -49106,13 +49106,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -49122,10 +49122,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -49135,10 +49135,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -49148,13 +49148,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -49163,10 +49163,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -49176,10 +49176,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -49189,10 +49189,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -49204,12 +49204,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -49220,11 +49220,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -49235,11 +49235,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -49249,11 +49249,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -49264,11 +49264,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -49281,13 +49281,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -49297,11 +49297,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -49312,11 +49312,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -49329,13 +49329,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -49345,11 +49345,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -49360,11 +49360,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -49377,13 +49377,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49394,12 +49394,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49411,12 +49411,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49428,12 +49428,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -49446,13 +49446,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49463,12 +49463,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49480,12 +49480,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49497,12 +49497,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -49515,13 +49515,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -49535,13 +49535,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -49551,10 +49551,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -49564,10 +49564,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -49577,13 +49577,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -49592,10 +49592,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -49605,10 +49605,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -49618,10 +49618,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -49633,12 +49633,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -49649,11 +49649,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -49664,11 +49664,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -49678,11 +49678,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -49693,11 +49693,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -49710,13 +49710,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -49726,11 +49726,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -49741,11 +49741,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -49758,13 +49758,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -49774,11 +49774,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -49789,11 +49789,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -49806,13 +49806,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49823,12 +49823,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49840,12 +49840,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49857,12 +49857,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -49875,13 +49875,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49892,12 +49892,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49909,12 +49909,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_strided_a) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -49926,12 +49926,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -49944,13 +49944,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -49964,13 +49964,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -49980,10 +49980,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -49993,10 +49993,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -50006,7 +50006,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
diff --git a/test/qc8-gemm-minmax-fp32.yaml b/test/qc8-gemm-minmax-fp32.yaml
index 79c040a..ef42575 100644
--- a/test/qc8-gemm-minmax-fp32.yaml
+++ b/test/qc8-gemm-minmax-fp32.yaml
@@ -315,22 +315,22 @@
 - name: xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx
   init: xnn_init_qs8_minmax_avx512_params
   k-block: 8
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
-- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
 - name: xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_lrint
diff --git a/test/qc8-igemm-minmax-fp32.cc b/test/qc8-igemm-minmax-fp32.cc
index 0be8e8c..8c96ac4 100644
--- a/test/qc8-igemm-minmax-fp32.cc
+++ b/test/qc8-igemm-minmax-fp32.cc
@@ -45419,7 +45419,7 @@
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -45428,10 +45428,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -45441,10 +45441,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -45456,12 +45456,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45472,11 +45472,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45487,11 +45487,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45501,11 +45501,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45518,13 +45518,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45534,11 +45534,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45551,13 +45551,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45567,11 +45567,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45584,13 +45584,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45601,12 +45601,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45618,12 +45618,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -45636,13 +45636,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45653,12 +45653,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45670,12 +45670,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -45688,13 +45688,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45705,11 +45705,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45723,13 +45723,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45741,12 +45741,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45758,12 +45758,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45777,13 +45777,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45795,11 +45795,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, zero) {
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45813,12 +45813,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -45828,10 +45828,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -45841,10 +45841,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -45854,13 +45854,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -45869,10 +45869,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -45882,10 +45882,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -45897,12 +45897,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45913,11 +45913,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45928,11 +45928,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45942,11 +45942,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45959,13 +45959,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45975,11 +45975,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45992,13 +45992,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -46008,11 +46008,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46025,13 +46025,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46042,12 +46042,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46059,12 +46059,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -46077,13 +46077,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46094,12 +46094,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46111,12 +46111,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -46129,13 +46129,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -46146,11 +46146,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46164,13 +46164,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46182,12 +46182,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46199,12 +46199,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46218,13 +46218,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -46236,11 +46236,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, zero) {
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46254,12 +46254,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -46269,10 +46269,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -46282,10 +46282,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -46295,13 +46295,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -46310,10 +46310,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -46323,10 +46323,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -46338,12 +46338,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -46354,11 +46354,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -46369,11 +46369,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -46383,11 +46383,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46400,13 +46400,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -46416,11 +46416,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46433,13 +46433,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -46449,11 +46449,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46466,13 +46466,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46483,12 +46483,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46500,12 +46500,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -46518,13 +46518,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46535,12 +46535,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46552,12 +46552,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -46570,13 +46570,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -46587,11 +46587,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46605,13 +46605,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46623,12 +46623,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46640,12 +46640,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46659,13 +46659,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -46677,11 +46677,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, zero) {
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46695,12 +46695,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -46710,10 +46710,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -46723,10 +46723,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -46736,13 +46736,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -46751,10 +46751,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -46764,10 +46764,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -46779,12 +46779,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46795,11 +46795,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46810,11 +46810,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46824,11 +46824,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46841,13 +46841,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46857,11 +46857,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46874,13 +46874,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46890,11 +46890,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46907,13 +46907,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46924,12 +46924,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46941,12 +46941,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -46959,13 +46959,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46976,12 +46976,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46993,12 +46993,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -47011,13 +47011,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -47028,11 +47028,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47046,13 +47046,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47064,12 +47064,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47081,12 +47081,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47100,13 +47100,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -47118,11 +47118,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, zero) {
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47136,12 +47136,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -47151,10 +47151,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -47164,10 +47164,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -47177,13 +47177,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47192,10 +47192,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47205,10 +47205,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -47220,12 +47220,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47236,11 +47236,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47251,11 +47251,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47265,11 +47265,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47282,13 +47282,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47298,11 +47298,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47315,13 +47315,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47331,11 +47331,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47348,13 +47348,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47365,12 +47365,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47382,12 +47382,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -47400,13 +47400,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47417,12 +47417,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47434,12 +47434,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -47452,13 +47452,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47469,11 +47469,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47487,13 +47487,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47505,12 +47505,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47522,12 +47522,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47541,13 +47541,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47559,11 +47559,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, zero) {
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47577,12 +47577,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47592,10 +47592,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47605,10 +47605,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47618,13 +47618,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -47633,10 +47633,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -47646,10 +47646,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -47661,12 +47661,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47677,11 +47677,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47692,11 +47692,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47706,11 +47706,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47723,13 +47723,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47739,11 +47739,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47756,13 +47756,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47772,11 +47772,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47789,13 +47789,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47806,12 +47806,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47823,12 +47823,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -47841,13 +47841,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47858,12 +47858,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47875,12 +47875,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -47893,13 +47893,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47910,11 +47910,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47928,13 +47928,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47946,12 +47946,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47963,12 +47963,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47982,13 +47982,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -48000,11 +48000,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, zero) {
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -48018,12 +48018,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -48033,10 +48033,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -48046,10 +48046,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -48059,7 +48059,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_minmax_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
diff --git a/test/qc8-igemm-minmax-fp32.yaml b/test/qc8-igemm-minmax-fp32.yaml
index 9cc1be3..2af5dfa 100644
--- a/test/qc8-igemm-minmax-fp32.yaml
+++ b/test/qc8-igemm-minmax-fp32.yaml
@@ -294,22 +294,22 @@
 - name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx
   init: xnn_init_qs8_minmax_avx512_params
   k-block: 8
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_minmax_wasmsimd_params
   k-block: 8
 - name: xnn_qc8_igemm_minmax_fp32_ukernel_1x2__scalar_lrint
diff --git a/test/qs8-gemm-minmax-fp32.cc b/test/qs8-gemm-minmax-fp32.cc
index 07e847f..446efb5 100644
--- a/test/qs8-gemm-minmax-fp32.cc
+++ b/test/qs8-gemm-minmax-fp32.cc
@@ -61025,7 +61025,7 @@
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -61034,10 +61034,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -61047,10 +61047,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -61060,10 +61060,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -61075,12 +61075,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -61091,11 +61091,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -61106,11 +61106,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -61120,11 +61120,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -61135,11 +61135,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -61152,13 +61152,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -61168,11 +61168,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -61183,11 +61183,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -61200,13 +61200,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -61216,11 +61216,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -61231,11 +61231,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -61248,13 +61248,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61265,12 +61265,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61282,12 +61282,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61299,12 +61299,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -61317,13 +61317,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61334,12 +61334,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61351,12 +61351,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61368,12 +61368,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -61386,13 +61386,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -61406,13 +61406,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -61422,10 +61422,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -61435,10 +61435,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -61448,13 +61448,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -61463,10 +61463,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -61476,10 +61476,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -61489,10 +61489,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -61504,12 +61504,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -61520,11 +61520,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -61535,11 +61535,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -61549,11 +61549,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -61564,11 +61564,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -61581,13 +61581,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -61597,11 +61597,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -61612,11 +61612,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -61629,13 +61629,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -61645,11 +61645,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -61660,11 +61660,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -61677,13 +61677,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61694,12 +61694,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61711,12 +61711,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61728,12 +61728,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -61746,13 +61746,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61763,12 +61763,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61780,12 +61780,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -61797,12 +61797,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -61815,13 +61815,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -61835,13 +61835,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -61851,10 +61851,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -61864,10 +61864,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -61877,13 +61877,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -61892,10 +61892,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -61905,10 +61905,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -61918,10 +61918,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -61933,12 +61933,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -61949,11 +61949,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -61964,11 +61964,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -61978,11 +61978,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -61993,11 +61993,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62010,13 +62010,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -62026,11 +62026,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -62041,11 +62041,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62058,13 +62058,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -62074,11 +62074,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -62089,11 +62089,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62106,13 +62106,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62123,12 +62123,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62140,12 +62140,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62157,12 +62157,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -62175,13 +62175,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62192,12 +62192,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62209,12 +62209,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62226,12 +62226,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -62244,13 +62244,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62264,13 +62264,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -62280,10 +62280,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -62293,10 +62293,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -62306,13 +62306,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -62321,10 +62321,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -62334,10 +62334,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -62347,10 +62347,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -62362,12 +62362,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -62378,11 +62378,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -62393,11 +62393,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -62407,11 +62407,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -62422,11 +62422,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62439,13 +62439,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -62455,11 +62455,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -62470,11 +62470,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62487,13 +62487,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -62503,11 +62503,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -62518,11 +62518,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62535,13 +62535,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62552,12 +62552,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62569,12 +62569,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62586,12 +62586,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -62604,13 +62604,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62621,12 +62621,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62638,12 +62638,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62655,12 +62655,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -62673,13 +62673,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62693,13 +62693,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -62709,10 +62709,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -62722,10 +62722,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -62735,13 +62735,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -62750,10 +62750,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -62763,10 +62763,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -62776,10 +62776,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -62791,12 +62791,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -62807,11 +62807,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -62822,11 +62822,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -62836,11 +62836,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -62851,11 +62851,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62868,13 +62868,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -62884,11 +62884,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -62899,11 +62899,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62916,13 +62916,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -62932,11 +62932,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -62947,11 +62947,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -62964,13 +62964,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62981,12 +62981,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -62998,12 +62998,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63015,12 +63015,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -63033,13 +63033,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63050,12 +63050,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63067,12 +63067,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63084,12 +63084,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -63102,13 +63102,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -63122,13 +63122,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -63138,10 +63138,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -63151,10 +63151,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -63164,13 +63164,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -63179,10 +63179,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -63192,10 +63192,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -63205,10 +63205,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -63220,12 +63220,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -63236,11 +63236,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -63251,11 +63251,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -63265,11 +63265,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -63280,11 +63280,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -63297,13 +63297,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -63313,11 +63313,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -63328,11 +63328,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -63345,13 +63345,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -63361,11 +63361,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -63376,11 +63376,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -63393,13 +63393,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63410,12 +63410,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63427,12 +63427,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63444,12 +63444,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -63462,13 +63462,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63479,12 +63479,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63496,12 +63496,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_strided_a) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63513,12 +63513,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -63531,13 +63531,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -63551,13 +63551,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -63567,10 +63567,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -63580,10 +63580,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QS8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -63593,13 +63593,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_eq_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_eq_8) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(1)
@@ -63609,10 +63609,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, strided_cn) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, strided_cn) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(1)
@@ -63623,10 +63623,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(1)
@@ -63637,10 +63637,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_eq_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -63653,12 +63653,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -63670,11 +63670,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -63686,11 +63686,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_lt_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -63701,11 +63701,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -63717,11 +63717,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_lt_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -63735,13 +63735,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_gt_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -63752,11 +63752,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -63768,11 +63768,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_gt_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -63786,13 +63786,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_div_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -63803,11 +63803,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_div_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -63819,11 +63819,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, k_div_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -63837,13 +63837,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, n_gt_4) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63855,12 +63855,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, n_gt_4_strided_cn) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63873,12 +63873,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, n_gt_4_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63891,12 +63891,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, n_gt_4_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -63910,13 +63910,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, n_div_4) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63928,12 +63928,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, n_div_4_strided_cn) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63946,12 +63946,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, n_div_4_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -63964,12 +63964,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, n_div_4_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -63983,13 +63983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, strided_cm_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -64004,13 +64004,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD, strided_cm) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__WASMSIMD_MUL16, strided_cm) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(1)
@@ -64021,13 +64021,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_eq_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_eq_8) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(2)
@@ -64037,10 +64037,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, strided_cn) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, strided_cn) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(2)
@@ -64051,10 +64051,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(2)
@@ -64065,10 +64065,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_eq_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -64081,12 +64081,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64098,11 +64098,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64114,11 +64114,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_lt_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64129,11 +64129,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64145,11 +64145,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_lt_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -64163,13 +64163,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_gt_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64180,11 +64180,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64196,11 +64196,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_gt_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -64214,13 +64214,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_div_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64231,11 +64231,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_div_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64247,11 +64247,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, k_div_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -64265,13 +64265,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, n_gt_4) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64283,12 +64283,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, n_gt_4_strided_cn) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64301,12 +64301,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, n_gt_4_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64319,12 +64319,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, n_gt_4_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -64338,13 +64338,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, n_div_4) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64356,12 +64356,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, n_div_4_strided_cn) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64374,12 +64374,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, n_div_4_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64392,12 +64392,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, n_div_4_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -64411,13 +64411,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, strided_cm_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -64432,13 +64432,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD, strided_cm) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__WASMSIMD_MUL16, strided_cm) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(2)
@@ -64449,13 +64449,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_eq_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_eq_8) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(3)
@@ -64465,10 +64465,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, strided_cn) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, strided_cn) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(3)
@@ -64479,10 +64479,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_eq_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(3)
@@ -64493,10 +64493,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_eq_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -64509,12 +64509,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_eq_8_subtile_m) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64526,11 +64526,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_eq_8_subtile_n) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64542,11 +64542,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_lt_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64557,11 +64557,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_lt_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64573,11 +64573,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_lt_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -64591,13 +64591,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_gt_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64608,11 +64608,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_gt_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64624,11 +64624,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_gt_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -64642,13 +64642,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_div_8) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64659,11 +64659,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_div_8_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .extended_weights(true)
@@ -64675,11 +64675,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, k_div_8_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -64693,13 +64693,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, n_gt_4) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64711,12 +64711,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, n_gt_4_strided_cn) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64729,12 +64729,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, n_gt_4_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64747,12 +64747,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, n_gt_4_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -64766,13 +64766,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, n_div_4) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64784,12 +64784,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, n_div_4_strided_cn) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64802,12 +64802,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, n_div_4_strided_a) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -64820,12 +64820,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, n_div_4_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -64839,13 +64839,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, strided_cm_subtile) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -64860,13 +64860,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD, strided_cm) {
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__WASMSIMD_MUL16, strided_cm) {
     GemmMicrokernelTester()
       .extended_weights(true)
       .mr(3)
@@ -64877,7 +64877,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
diff --git a/test/qs8-gemm-minmax-fp32.yaml b/test/qs8-gemm-minmax-fp32.yaml
index 933a48f..7904179 100644
--- a/test/qs8-gemm-minmax-fp32.yaml
+++ b/test/qs8-gemm-minmax-fp32.yaml
@@ -405,31 +405,31 @@
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx
   init: xnn_init_qs8_conv_minmax_fp32_avx512_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_lrint
diff --git a/test/qs8-igemm-minmax-fp32.cc b/test/qs8-igemm-minmax-fp32.cc
index 078a8fa..b7369a7 100644
--- a/test/qs8-igemm-minmax-fp32.cc
+++ b/test/qs8-igemm-minmax-fp32.cc
@@ -44951,7 +44951,7 @@
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -44960,10 +44960,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -44973,10 +44973,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -44988,12 +44988,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45004,11 +45004,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45019,11 +45019,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45033,11 +45033,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45050,13 +45050,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45066,11 +45066,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45083,13 +45083,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45099,11 +45099,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45116,13 +45116,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45133,12 +45133,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45150,12 +45150,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -45168,13 +45168,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45185,12 +45185,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45202,12 +45202,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -45220,13 +45220,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45237,11 +45237,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45255,13 +45255,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45273,12 +45273,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45290,12 +45290,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45309,13 +45309,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -45327,11 +45327,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, zero) {
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45345,12 +45345,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -45360,10 +45360,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -45373,10 +45373,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -45386,13 +45386,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -45401,10 +45401,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -45414,10 +45414,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -45429,12 +45429,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45445,11 +45445,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45460,11 +45460,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45474,11 +45474,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45491,13 +45491,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45507,11 +45507,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45524,13 +45524,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45540,11 +45540,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45557,13 +45557,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45574,12 +45574,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45591,12 +45591,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -45609,13 +45609,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45626,12 +45626,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45643,12 +45643,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -45661,13 +45661,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45678,11 +45678,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45696,13 +45696,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45714,12 +45714,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45731,12 +45731,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45750,13 +45750,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -45768,11 +45768,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, zero) {
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -45786,12 +45786,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -45801,10 +45801,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -45814,10 +45814,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -45827,13 +45827,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -45842,10 +45842,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -45855,10 +45855,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -45870,12 +45870,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -45886,11 +45886,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -45901,11 +45901,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -45915,11 +45915,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45932,13 +45932,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -45948,11 +45948,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45965,13 +45965,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -45981,11 +45981,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -45998,13 +45998,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46015,12 +46015,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46032,12 +46032,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -46050,13 +46050,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46067,12 +46067,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46084,12 +46084,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -46102,13 +46102,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -46119,11 +46119,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46137,13 +46137,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46155,12 +46155,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46172,12 +46172,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46191,13 +46191,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -46209,11 +46209,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, zero) {
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46227,12 +46227,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -46242,10 +46242,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -46255,10 +46255,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -46268,13 +46268,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -46283,10 +46283,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -46296,10 +46296,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -46311,12 +46311,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46327,11 +46327,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46342,11 +46342,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46356,11 +46356,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46373,13 +46373,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46389,11 +46389,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46406,13 +46406,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46422,11 +46422,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46439,13 +46439,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46456,12 +46456,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46473,12 +46473,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -46491,13 +46491,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46508,12 +46508,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46525,12 +46525,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -46543,13 +46543,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46560,11 +46560,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46578,13 +46578,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46596,12 +46596,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46613,12 +46613,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46632,13 +46632,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -46650,11 +46650,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, zero) {
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46668,12 +46668,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -46683,10 +46683,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -46696,10 +46696,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -46709,13 +46709,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -46724,10 +46724,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -46737,10 +46737,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -46752,12 +46752,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -46768,11 +46768,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -46783,11 +46783,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -46797,11 +46797,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46814,13 +46814,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -46830,11 +46830,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46847,13 +46847,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -46863,11 +46863,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -46880,13 +46880,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46897,12 +46897,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46914,12 +46914,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -46932,13 +46932,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46949,12 +46949,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -46966,12 +46966,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -46984,13 +46984,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47001,11 +47001,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47019,13 +47019,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47037,12 +47037,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47054,12 +47054,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47073,13 +47073,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -47091,11 +47091,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, zero) {
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47109,12 +47109,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47124,10 +47124,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47137,10 +47137,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -47150,13 +47150,13 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -47165,10 +47165,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -47178,10 +47178,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -47193,12 +47193,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47209,11 +47209,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47224,11 +47224,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47238,11 +47238,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47255,13 +47255,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47271,11 +47271,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47288,13 +47288,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47304,11 +47304,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47321,13 +47321,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47338,12 +47338,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47355,12 +47355,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -47373,13 +47373,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47390,12 +47390,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47407,12 +47407,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -47425,13 +47425,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47442,11 +47442,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47460,13 +47460,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47478,12 +47478,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47495,12 +47495,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -47514,13 +47514,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -47532,11 +47532,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, zero) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, zero) {
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -47550,12 +47550,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -47565,10 +47565,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -47578,10 +47578,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL16_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -47591,7 +47591,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_WASMSIMD
 
diff --git a/test/qs8-igemm-minmax-fp32.yaml b/test/qs8-igemm-minmax-fp32.yaml
index a8bb7a5..3e12b4b 100644
--- a/test/qs8-igemm-minmax-fp32.yaml
+++ b/test/qs8-igemm-minmax-fp32.yaml
@@ -291,22 +291,22 @@
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx
   init: xnn_init_qs8_conv_minmax_fp32_avx512_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128
   init: xnn_init_qs8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_lrint
diff --git a/test/qu8-gemm-minmax-fp32.cc b/test/qu8-gemm-minmax-fp32.cc
index 98980dd..7a549fb 100644
--- a/test/qu8-gemm-minmax-fp32.cc
+++ b/test/qu8-gemm-minmax-fp32.cc
@@ -33858,7 +33858,7 @@
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -33867,10 +33867,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -33880,10 +33880,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -33893,10 +33893,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -33908,12 +33908,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -33924,11 +33924,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -33939,11 +33939,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -33953,11 +33953,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -33968,11 +33968,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -33985,13 +33985,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34001,11 +34001,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34016,11 +34016,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34033,13 +34033,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34049,11 +34049,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34064,11 +34064,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34081,13 +34081,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34098,12 +34098,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34115,12 +34115,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34132,12 +34132,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -34150,13 +34150,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34167,12 +34167,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34184,12 +34184,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34201,12 +34201,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -34219,13 +34219,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34239,13 +34239,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -34255,10 +34255,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -34268,10 +34268,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -34281,10 +34281,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, no_a_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34295,11 +34295,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, no_b_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34310,11 +34310,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, no_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34326,14 +34326,14 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -34342,10 +34342,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -34355,10 +34355,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -34368,10 +34368,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -34383,12 +34383,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34399,11 +34399,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34414,11 +34414,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34428,11 +34428,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34443,11 +34443,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34460,13 +34460,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34476,11 +34476,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34491,11 +34491,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34508,13 +34508,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34524,11 +34524,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34539,11 +34539,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34556,13 +34556,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34573,12 +34573,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34590,12 +34590,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34607,12 +34607,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -34625,13 +34625,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34642,12 +34642,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34659,12 +34659,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34676,12 +34676,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -34694,13 +34694,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34714,13 +34714,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -34730,10 +34730,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -34743,10 +34743,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -34756,10 +34756,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, no_a_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34770,11 +34770,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, no_b_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34785,11 +34785,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, no_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -34801,14 +34801,14 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -34817,10 +34817,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -34830,10 +34830,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -34843,10 +34843,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -34858,12 +34858,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -34874,11 +34874,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -34889,11 +34889,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -34903,11 +34903,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -34918,11 +34918,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34935,13 +34935,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -34951,11 +34951,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -34966,11 +34966,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34983,13 +34983,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -34999,11 +34999,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -35014,11 +35014,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35031,13 +35031,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35048,12 +35048,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35065,12 +35065,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35082,12 +35082,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -35100,13 +35100,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35117,12 +35117,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35134,12 +35134,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35151,12 +35151,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -35169,13 +35169,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35189,13 +35189,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -35205,10 +35205,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -35218,10 +35218,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -35231,10 +35231,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, no_a_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -35245,11 +35245,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, no_b_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -35260,11 +35260,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, no_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -35276,14 +35276,14 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -35292,10 +35292,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -35305,10 +35305,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -35318,10 +35318,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -35333,12 +35333,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35349,11 +35349,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35364,11 +35364,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35378,11 +35378,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35393,11 +35393,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35410,13 +35410,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35426,11 +35426,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35441,11 +35441,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35458,13 +35458,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35474,11 +35474,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35489,11 +35489,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35506,13 +35506,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35523,12 +35523,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35540,12 +35540,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35557,12 +35557,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -35575,13 +35575,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35592,12 +35592,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35609,12 +35609,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35626,12 +35626,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -35644,13 +35644,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35664,13 +35664,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -35680,10 +35680,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -35693,10 +35693,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -35706,10 +35706,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, no_a_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35720,11 +35720,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, no_b_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35735,11 +35735,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, no_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35751,14 +35751,14 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -35767,10 +35767,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -35780,10 +35780,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -35793,10 +35793,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -35808,12 +35808,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35824,11 +35824,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35839,11 +35839,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35853,11 +35853,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35868,11 +35868,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35885,13 +35885,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35901,11 +35901,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35916,11 +35916,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35933,13 +35933,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35949,11 +35949,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35964,11 +35964,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35981,13 +35981,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35998,12 +35998,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36015,12 +36015,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36032,12 +36032,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -36050,13 +36050,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36067,12 +36067,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36084,12 +36084,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36101,12 +36101,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -36119,13 +36119,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36139,13 +36139,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -36155,10 +36155,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -36168,10 +36168,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -36181,10 +36181,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, no_a_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -36195,11 +36195,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, no_b_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -36210,11 +36210,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, no_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -36226,14 +36226,14 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -36242,10 +36242,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -36255,10 +36255,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_eq_8_strided_a) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -36268,10 +36268,10 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -36283,12 +36283,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36299,11 +36299,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36314,11 +36314,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36328,11 +36328,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_lt_8_strided_a) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36343,11 +36343,11 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36360,13 +36360,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36376,11 +36376,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_gt_8_strided_a) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36391,11 +36391,11 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36408,13 +36408,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36424,11 +36424,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_div_8_strided_a) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36439,11 +36439,11 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36456,13 +36456,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36473,12 +36473,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36490,12 +36490,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_gt_4_strided_a) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36507,12 +36507,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -36525,13 +36525,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36542,12 +36542,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36559,12 +36559,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_strided_a) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_div_4_strided_a) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36576,12 +36576,12 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -36594,13 +36594,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36614,13 +36614,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -36630,10 +36630,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -36643,10 +36643,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -36656,10 +36656,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, no_a_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36670,11 +36670,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, no_b_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36685,11 +36685,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, no_zero_point) {
+  TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36701,7 +36701,7 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
diff --git a/test/qu8-gemm-minmax-fp32.yaml b/test/qu8-gemm-minmax-fp32.yaml
index 6da05cf..b72aeb5 100644
--- a/test/qu8-gemm-minmax-fp32.yaml
+++ b/test/qu8-gemm-minmax-fp32.yaml
@@ -204,22 +204,22 @@
 - name: xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx
   init: xnn_init_qu8_conv_minmax_fp32_avx512_params
   k-block: 8
-- name: xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
 - name: xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrint
diff --git a/test/qu8-igemm-minmax-fp32.cc b/test/qu8-igemm-minmax-fp32.cc
index 6609fcb..ec0c8b5 100644
--- a/test/qu8-igemm-minmax-fp32.cc
+++ b/test/qu8-igemm-minmax-fp32.cc
@@ -34662,7 +34662,7 @@
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -34671,10 +34671,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -34684,10 +34684,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -34699,12 +34699,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34715,11 +34715,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34730,11 +34730,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34744,11 +34744,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34761,13 +34761,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34777,11 +34777,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34794,13 +34794,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34810,11 +34810,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34827,13 +34827,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34844,12 +34844,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34861,12 +34861,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -34879,13 +34879,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34896,12 +34896,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34913,12 +34913,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -34931,13 +34931,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -34948,11 +34948,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, small_kernel_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -34966,13 +34966,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_gt_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -34984,12 +34984,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, n_div_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35001,12 +35001,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35020,13 +35020,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, a_offset) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35038,11 +35038,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, zero) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, zero) {
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35056,12 +35056,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -35071,10 +35071,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -35084,10 +35084,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -35097,10 +35097,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, no_a_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35111,11 +35111,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, no_b_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35126,11 +35126,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD64, no_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD64, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -35142,14 +35142,14 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -35158,10 +35158,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -35171,10 +35171,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -35186,12 +35186,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35202,11 +35202,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35217,11 +35217,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35231,11 +35231,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35248,13 +35248,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35264,11 +35264,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35281,13 +35281,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35297,11 +35297,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35314,13 +35314,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35331,12 +35331,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35348,12 +35348,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -35366,13 +35366,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35383,12 +35383,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35400,12 +35400,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -35418,13 +35418,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35435,11 +35435,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, small_kernel_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35453,13 +35453,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_gt_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35471,12 +35471,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, n_div_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35488,12 +35488,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35507,13 +35507,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, a_offset) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35525,11 +35525,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, zero) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, zero) {
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35543,12 +35543,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -35558,10 +35558,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -35571,10 +35571,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -35584,10 +35584,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, no_a_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35598,11 +35598,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, no_b_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35613,11 +35613,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD64, no_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD64, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -35629,14 +35629,14 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -35645,10 +35645,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -35658,10 +35658,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -35673,12 +35673,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_m) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -35689,11 +35689,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_eq_8_subtile_n) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -35704,11 +35704,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -35718,11 +35718,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_lt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35735,13 +35735,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -35751,11 +35751,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_gt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35768,13 +35768,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -35784,11 +35784,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, k_div_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35801,13 +35801,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35818,12 +35818,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35835,12 +35835,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -35853,13 +35853,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35870,12 +35870,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35887,12 +35887,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -35905,13 +35905,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -35922,11 +35922,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, small_kernel_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35940,13 +35940,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_gt_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35958,12 +35958,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, n_div_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -35975,12 +35975,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -35994,13 +35994,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, a_offset) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36012,11 +36012,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, zero) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, zero) {
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36030,12 +36030,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmin) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -36045,10 +36045,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, qmax) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -36058,10 +36058,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, strided_cm) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -36071,10 +36071,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, no_a_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36085,11 +36085,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, no_b_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36100,11 +36100,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD64, no_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD64, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -36116,14 +36116,14 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -36132,10 +36132,10 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -36145,10 +36145,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -36160,12 +36160,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -36176,11 +36176,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -36191,11 +36191,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -36205,11 +36205,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36222,13 +36222,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
@@ -36238,11 +36238,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36255,13 +36255,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(1)
@@ -36271,11 +36271,11 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36288,13 +36288,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36305,12 +36305,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36322,12 +36322,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -36340,13 +36340,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36357,12 +36357,12 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36374,12 +36374,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 1; m++) {
@@ -36392,13 +36392,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -36409,11 +36409,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, small_kernel_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36427,13 +36427,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_gt_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36445,12 +36445,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, n_div_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36462,12 +36462,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36481,13 +36481,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, a_offset) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -36499,11 +36499,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, zero) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, zero) {
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36517,12 +36517,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -36532,10 +36532,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -36545,10 +36545,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(1)
       .nr(4)
@@ -36558,10 +36558,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, no_a_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -36572,11 +36572,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, no_b_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -36587,11 +36587,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_LD128, no_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_1X4C8__WASMSIMD_MUL32_LD128, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(1)
@@ -36603,14 +36603,14 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -36619,10 +36619,10 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -36632,10 +36632,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -36647,12 +36647,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -36663,11 +36663,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -36678,11 +36678,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -36692,11 +36692,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36709,13 +36709,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
@@ -36725,11 +36725,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36742,13 +36742,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(2)
@@ -36758,11 +36758,11 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36775,13 +36775,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36792,12 +36792,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36809,12 +36809,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -36827,13 +36827,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36844,12 +36844,12 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36861,12 +36861,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 2; m++) {
@@ -36879,13 +36879,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -36896,11 +36896,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, small_kernel_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36914,13 +36914,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_gt_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36932,12 +36932,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, n_div_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -36949,12 +36949,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -36968,13 +36968,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, a_offset) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -36986,11 +36986,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, zero) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, zero) {
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -37004,12 +37004,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -37019,10 +37019,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -37032,10 +37032,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(2)
       .nr(4)
@@ -37045,10 +37045,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, no_a_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -37059,11 +37059,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, no_b_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -37074,11 +37074,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_LD128, no_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_2X4C8__WASMSIMD_MUL32_LD128, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(2)
@@ -37090,14 +37090,14 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_eq_8) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -37106,10 +37106,10 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, strided_cn) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -37119,10 +37119,10 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile) {
     for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
         GemmMicrokernelTester()
@@ -37134,12 +37134,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_m) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_m) {
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -37150,11 +37150,11 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_eq_8_subtile_n) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_eq_8_subtile_n) {
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -37165,11 +37165,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_lt_8) {
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -37179,11 +37179,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_lt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_lt_8_subtile) {
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -37196,13 +37196,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_gt_8) {
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
@@ -37212,11 +37212,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_gt_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_gt_8_subtile) {
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -37229,13 +37229,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_div_8) {
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(3)
@@ -37245,11 +37245,11 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, k_div_8_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, k_div_8_subtile) {
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -37262,13 +37262,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_gt_4) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -37279,12 +37279,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_gt_4_strided_cn) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -37296,12 +37296,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_gt_4_subtile) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -37314,13 +37314,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_div_4) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -37331,12 +37331,12 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_strided_cn) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_div_4_strided_cn) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -37348,12 +37348,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_div_4_subtile) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 3; m++) {
@@ -37366,13 +37366,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, small_kernel) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -37383,11 +37383,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, small_kernel_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, small_kernel_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -37401,13 +37401,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_gt_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_gt_4_small_kernel) {
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -37419,12 +37419,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, n_div_4_small_kernel) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, n_div_4_small_kernel) {
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -37436,12 +37436,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm_subtile) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, strided_cm_subtile) {
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 4; n++) {
@@ -37455,13 +37455,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+            .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, a_offset) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, a_offset) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -37473,11 +37473,11 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, zero) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, zero) {
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
@@ -37491,12 +37491,12 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+          .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
       }
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmin) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, qmin) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -37506,10 +37506,10 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, qmax) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, qmax) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -37519,10 +37519,10 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, strided_cm) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, strided_cm) {
     GemmMicrokernelTester()
       .mr(3)
       .nr(4)
@@ -37532,10 +37532,10 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+      .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, no_a_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, no_a_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -37546,11 +37546,11 @@
         .n(4)
         .k(k)
         .a_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, no_b_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, no_b_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -37561,11 +37561,11 @@
         .n(4)
         .k(k)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 
-  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_LD128, no_zero_point) {
+  TEST(QU8_IGEMM_MINMAX_FP32_3X4C8__WASMSIMD_MUL32_LD128, no_zero_point) {
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(3)
@@ -37577,7 +37577,7 @@
         .k(k)
         .a_zero_point(0)
         .b_zero_point(0)
-        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+        .Test(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
diff --git a/test/qu8-igemm-minmax-fp32.yaml b/test/qu8-igemm-minmax-fp32.yaml
index f19e9e1..c2b518c 100644
--- a/test/qu8-igemm-minmax-fp32.yaml
+++ b/test/qu8-igemm-minmax-fp32.yaml
@@ -204,22 +204,22 @@
 - name: xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx
   init: xnn_init_qu8_conv_minmax_fp32_avx512_params
   k-block: 8
-- name: xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld64
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld64
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld64
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_ld128
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_ld128
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
-- name: xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_ld128
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128
   init: xnn_init_qu8_conv_minmax_fp32_wasmsimd_params
   k-block: 8
 - name: xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrint