Rename QS8 GEMM/IGEMM/DWCONV microkernels

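Include the requantization scheme (gemmlowp) in the microkernel names,
e.g. xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot becomes
xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c4__neondot.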

PiperOrigin-RevId: 375612878
diff --git a/src/init.c b/src/init.c
index 14d2ca6..a952d30 100644
--- a/src/init.c
+++ b/src/init.c
@@ -110,30 +110,30 @@
       init_flags |= XNN_INIT_FLAG_QS8;
 
       if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
-        xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot);
-        xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot);
-        xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot);
-        xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot);
+        xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c4__neondot);
+        xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c4__neondot);
+        xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c4__neondot);
+        xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c4__neondot);
         xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
         xnn_params.qs8.gemm.mr = 4;
         xnn_params.qs8.gemm.nr = 8;
         xnn_params.qs8.gemm.log2_kr = 2;
       } else {
-        xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
-        xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
-        xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
-        xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+        xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup);
+        xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup);
+        xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup);
+        xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup);
         xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
         xnn_params.qs8.gemm.mr = 2;
         xnn_params.qs8.gemm.nr = 8;
         xnn_params.qs8.gemm.log2_kr = 1;
       }
 
-      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16;
+      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__neon_mul16;
       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_neon_params;
       xnn_params.qs8.dwconv[0].channel_tile = 8;
       xnn_params.qs8.dwconv[0].primary_tile = 9;
-      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x25__neon_mul16;
+      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__neon_mul16;
       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_neon_params;
       xnn_params.qs8.dwconv[1].channel_tile = 8;
       xnn_params.qs8.dwconv[1].primary_tile = 25;
@@ -841,19 +841,19 @@
     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
       #if XNN_ENABLE_ASSEMBLY
         if (cpuinfo_has_arm_neon_dot()) {
-          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c4__neondot);
           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
           xnn_params.qs8.gemm.mr = 4;
           xnn_params.qs8.gemm.nr = 16;
           xnn_params.qs8.gemm.log2_kr = 2;
         } else {
-          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal);
+          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal);
           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
           xnn_params.qs8.gemm.mr = 2;
           xnn_params.qs8.gemm.nr = 8;
@@ -861,19 +861,19 @@
         }
       #else  // !XNN_ENABLE_ASSEMBLY
         if (cpuinfo_has_arm_neon_dot()) {
-          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c4__neondot);
           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
           xnn_params.qs8.gemm.mr = 4;
           xnn_params.qs8.gemm.nr = 16;
           xnn_params.qs8.gemm.log2_kr = 2;
         } else {
-          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup);
           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
           xnn_params.qs8.gemm.mr = 2;
           xnn_params.qs8.gemm.nr = 8;
@@ -885,16 +885,16 @@
         if (cpuinfo_has_arm_neon_dot()) {
           switch (cpuinfo_get_core(0)->uarch) {
             case cpuinfo_uarch_cortex_a55:
-              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
-              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55);
+              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55);
               break;
             default:
-              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
-              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
+              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64);
+              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64);
               break;
           }
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c4__neondot);
           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
           xnn_params.qs8.gemm.mr = 4;
           xnn_params.qs8.gemm.nr = 16;
@@ -903,10 +903,10 @@
           switch (cpuinfo_get_core(0)->uarch) {
             case cpuinfo_uarch_cortex_a53:
             case cpuinfo_uarch_cortex_a55r0:
-              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
-              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
-              xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane);
-              xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane);
+              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
+              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
+              xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16__neon_mlal_lane);
+              xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16__neon_mlal_lane);
               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
               xnn_params.qs8.gemm.mr = 4;
               xnn_params.qs8.gemm.nr = 16;
@@ -915,10 +915,10 @@
             case cpuinfo_uarch_cortex_a72:
             case cpuinfo_uarch_cortex_a73:
             case cpuinfo_uarch_kryo:
-              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm);
-              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm);
-              xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm);
-              xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm);
+              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm);
+              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm);
+              xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm);
+              xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm);
               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
               xnn_params.qs8.gemm.mr = 2;
               xnn_params.qs8.gemm.nr = 8;
@@ -926,10 +926,10 @@
               break;
 
             default:
-              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
-              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
-              xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal);
-              xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal);
+              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal);
+              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal);
+              xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal);
+              xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal);
               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
               xnn_params.qs8.gemm.mr = 2;
               xnn_params.qs8.gemm.nr = 8;
@@ -954,19 +954,19 @@
               case cpuinfo_uarch_cortex_a53:
               case cpuinfo_uarch_cortex_a55r0:
                 if (mr == 2 && nr == 8 && log2_kr == 3) {
-                  xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53;
-                  xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53;
-                  xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53;
-                  xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53;
+                  xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53;
+                  xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53;
+                  xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53;
+                  xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53;
                 }
                 break;
 
               case cpuinfo_uarch_cortex_a55:
                 if (mr == 4 && nr == 16 && log2_kr == 2) {
-                  xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55;
-                  xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55;
-                  xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot;
-                  xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot;
+                  xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55;
+                  xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55;
+                  xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__neondot;
+                  xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c4__neondot;
                 }
                 break;
               default:
@@ -977,19 +977,19 @@
         #endif  // XNN_MAX_UARCH_TYPES > 1
       #else  // !XNN_ENABLE_ASSEMBLY
         if (cpuinfo_has_arm_neon_dot()) {
-          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__neondot);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c4__neondot);
           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
           xnn_params.qs8.gemm.mr = 4;
           xnn_params.qs8.gemm.nr = 16;
           xnn_params.qs8.gemm.log2_kr = 2;
         } else {
-          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
-          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
-          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
-          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup);
+          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup);
           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_neon_params;
           xnn_params.qs8.gemm.mr = 2;
           xnn_params.qs8.gemm.nr = 8;
@@ -998,11 +998,11 @@
       #endif  // XNN_ENABLE_ASSEMBLY
     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
 
-    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16;
+    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__neon_mul16;
     xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_neon_params;
     xnn_params.qs8.dwconv[0].channel_tile = 8;
     xnn_params.qs8.dwconv[0].primary_tile = 9;
-    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x25__neon_mul16;
+    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__neon_mul16;
     xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_neon_params;
     xnn_params.qs8.dwconv[1].channel_tile = 8;
     xnn_params.qs8.dwconv[1].primary_tile = 25;
@@ -1560,65 +1560,65 @@
     init_flags |= XNN_INIT_FLAG_QS8;
 
     if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
-      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx);
-      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx);
-      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx);
-      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx);
+      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__avx512skx);
+      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__avx512skx);
+      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__avx512skx);
+      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__avx512skx);
       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.gemm.mr = 4;
       xnn_params.qs8.gemm.nr = 16;
       xnn_params.qs8.gemm.log2_kr = 3;
     } else if (cpuinfo_has_x86_xop()) {
       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
-      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64);
-      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64);
-      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64);
-      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64);
+      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld64);
+      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld64);
+      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__xop_ld64);
+      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__xop_ld64);
       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.gemm.mr = 2;
       xnn_params.qs8.gemm.nr = 4;
       xnn_params.qs8.gemm.log2_kr = 3;
     } else if (cpuinfo_has_x86_avx2()) {
-      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2);
-      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2);
-      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2);
-      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2);
+      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__avx2);
+      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__avx2);
+      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__avx2);
+      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__avx2);
       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_avx2_params;
       xnn_params.qs8.gemm.mr = 3;
       xnn_params.qs8.gemm.nr = 8;
       xnn_params.qs8.gemm.log2_kr = 3;
     } else if (cpuinfo_has_x86_avx()) {
-      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128);
-      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128);
-      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128);
-      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128);
+      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld128);
+      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld128);
+      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__avx_ld128);
+      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__avx_ld128);
       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.gemm.mr = 2;
       xnn_params.qs8.gemm.nr = 4;
       xnn_params.qs8.gemm.log2_kr = 3;
     } else if (cpuinfo_has_x86_sse4_1()) {
-      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64);
-      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64);
-      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64);
-      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64);
+      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld64);
+      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld64);
+      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__sse41_ld64);
+      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__sse41_ld64);
       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.gemm.mr = 3;
       xnn_params.qs8.gemm.nr = 4;
       xnn_params.qs8.gemm.log2_kr = 3;
     } else if (cpuinfo_has_x86_ssse3()) {
-      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld64);
-      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld64);
-      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64);
-      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64);
+      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld64);
+      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld64);
+      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__ssse3_ld64);
+      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__ssse3_ld64);
       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse2_params;
       xnn_params.qs8.gemm.mr = 3;
       xnn_params.qs8.gemm.nr = 4;
       xnn_params.qs8.gemm.log2_kr = 3;
     } else {
-      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld64);
-      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld64);
-      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64);
-      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64);
+      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld64);
+      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld64);
+      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__sse2_ld64);
+      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__sse2_ld64);
       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse2_params;
       xnn_params.qs8.gemm.mr = 3;
       xnn_params.qs8.gemm.nr = 4;
@@ -1626,53 +1626,53 @@
     }
 
     if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
-      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32;
+      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x9__avx512skx_mul32;
       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse2_params;
       xnn_params.qs8.dwconv[0].channel_tile = 32;
-      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up32x25__avx512skx_mul32;
+      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x25__avx512skx_mul32;
       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse2_params;
       xnn_params.qs8.dwconv[1].channel_tile = 32;
     } else if (cpuinfo_has_x86_xop()) {
       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
-      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32;
+      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__xop_mul32;
       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[0].channel_tile = 16;
-      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32;
+      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__xop_mul32;
       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[1].channel_tile = 16;
     } else if (cpuinfo_has_x86_avx2()) {
-      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32;
+      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx2_mul32;
       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_avx2_params;
       xnn_params.qs8.dwconv[0].channel_tile = 16;
-      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32;
+      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx2_mul32;
       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_avx2_params;
       xnn_params.qs8.dwconv[1].channel_tile = 16;
     } else if (cpuinfo_has_x86_avx()) {
-      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32;
+      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx_mul32;
       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[0].channel_tile = 16;
-      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32;
+      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx_mul32;
       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[1].channel_tile = 16;
     } else if (cpuinfo_has_x86_sse4_1()) {
-      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16;
+      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__sse41_mul16;
       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[0].channel_tile = 8;
-      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16;
+      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__sse41_mul16;
       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[1].channel_tile = 8;
     } else if (cpuinfo_has_x86_ssse3()) {
-      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__ssse3_mul16;
+      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__ssse3_mul16;
       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse2_params;
       xnn_params.qs8.dwconv[0].channel_tile = 8;
-      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x25__ssse3_mul16;
+      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__ssse3_mul16;
       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse2_params;
       xnn_params.qs8.dwconv[1].channel_tile = 8;
     } else if (cpuinfo_has_x86_sse2()) {
-      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__sse2_mul16;
+      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__sse2_mul16;
       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse2_params;
       xnn_params.qs8.dwconv[0].channel_tile = 8;
-      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x25__sse2_mul16;
+      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__sse2_mul16;
       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse2_params;
       xnn_params.qs8.dwconv[1].channel_tile = 8;
     }
@@ -2302,20 +2302,20 @@
   #ifndef XNN_NO_QS8_OPERATORS
     init_flags |= XNN_INIT_FLAG_QS8;
 
-    xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64);
-    xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64);
-    xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64);
-    xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64);
+    xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__wasmsimd_ld64);
+    xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__wasmsimd_ld64);
+    xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__wasmsimd_ld64);
+    xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__wasmsimd_ld64);
     xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_wasmsimd_params;
     xnn_params.qs8.gemm.mr = 3;
     xnn_params.qs8.gemm.nr = 4;
     xnn_params.qs8.gemm.log2_kr = 3;
 
-    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16;
+    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__wasmsimd_mul16;
     xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_wasmsimd_params;
     xnn_params.qs8.dwconv[0].channel_tile = 8;
     xnn_params.qs8.dwconv[0].primary_tile = 9;
-    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x25__wasmsimd_mul16;
+    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__wasmsimd_mul16;
     xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_wasmsimd_params;
     xnn_params.qs8.dwconv[1].channel_tile = 8;
     xnn_params.qs8.dwconv[1].primary_tile = 25;
@@ -2839,28 +2839,28 @@
     init_flags |= XNN_INIT_FLAG_QS8;
 
     if (is_wasm_x86) {
-      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x2__scalar);
-      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x2__scalar);
-      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x2__scalar);
-      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x2__scalar);
+      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x2__scalar);
+      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x2__scalar);
+      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x2__scalar);
+      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x2__scalar);
       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_scalar_params;
       xnn_params.qs8.gemm.mr = 2;
       xnn_params.qs8.gemm.nr = 2;
     } else {
-      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x4__scalar);
-      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x4__scalar);
-      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4__scalar);
-      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4__scalar);
+      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4__scalar);
+      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4__scalar);
+      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4__scalar);
+      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4__scalar);
       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_scalar_params;
       xnn_params.qs8.gemm.mr = 4;
       xnn_params.qs8.gemm.nr = 4;
     }
 
-    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up2x9__scalar;
+    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up2x9__scalar;
     xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_scalar_params;
     xnn_params.qs8.dwconv[0].channel_tile = 2;
     xnn_params.qs8.dwconv[0].primary_tile = 9;
-    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up2x25__scalar;
+    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up2x25__scalar;
     xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_scalar_params;
     xnn_params.qs8.dwconv[1].channel_tile = 2;
     xnn_params.qs8.dwconv[1].primary_tile = 25;
@@ -3225,19 +3225,19 @@
   #ifndef XNN_NO_QS8_OPERATORS
     init_flags |= XNN_INIT_FLAG_QS8;
 
-    xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4__scalar);
-    xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4__scalar);
-    xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4__scalar);
-    xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4__scalar);
+    xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4__scalar);
+    xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4__scalar);
+    xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4__scalar);
+    xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4__scalar);
     xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_scalar_params;
     xnn_params.qs8.gemm.mr = 3;
     xnn_params.qs8.gemm.nr = 4;
 
-    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up2x9__scalar;
+    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up2x9__scalar;
     xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_scalar_params;
     xnn_params.qs8.dwconv[0].channel_tile = 2;
     xnn_params.qs8.dwconv[0].primary_tile = 9;
-    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up2x25__scalar;
+    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up2x25__scalar;
     xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_scalar_params;
     xnn_params.qs8.dwconv[1].channel_tile = 2;
     xnn_params.qs8.dwconv[1].primary_tile = 25;
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul16.c
index 201e81c..9a333f4 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul32.c
index 715c5ab..a067c31 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul16.c
index 15c168f..713c60d 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul32.c
index 37d3bcc..e0161bb 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul32.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx2_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
@@ -417,7 +417,7 @@
         _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
 
       const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx512skx-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx512skx-mul32.c
index 5cdbe0f..864e168 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx512skx-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx512skx-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__avx512skx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx512skx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-neon-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-neon-mul16.c
index c5e4158..d8715ce 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-neon-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-neon-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__neon_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__neon_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse2-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse2-mul16.c
index 13a5160..569cc79 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse2-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__sse2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__sse2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul16.c
index 16037c0..52f071a 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__sse41_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul32.c
index f332747..3343cc4 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__sse41_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-ssse3-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-ssse3-mul16.c
index 938d0e5..8fa383e 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-ssse3-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__ssse3_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__ssse3_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-wasmsimd-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-wasmsimd-mul16.c
index 85135de..78c11a6 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-wasmsimd-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-wasmsimd-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__wasmsimd_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__wasmsimd_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c
index 9dc6211..aebaf30 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c
@@ -20,7 +20,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__xop_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul16.c
index a8bb56a..8bac6f5 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul32.c
index 45c2fa8..555f739 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul16.c
index b1db9e6..e478bae 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul32.c
index b03ec06..ae760d2 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul32.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx2_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
@@ -193,7 +193,7 @@
         _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
 
       const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx512skx-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx512skx-mul32.c
index 78c542a..c789d11 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx512skx-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx512skx-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__avx512skx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx512skx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-neon-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-neon-mul16.c
index 6daee4f..c11dc32 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-neon-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-neon-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__neon_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse2-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse2-mul16.c
index 96df3cd..ec641b7 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse2-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__sse2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__sse2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul16.c
index eadc573..a6f5b2f 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__sse41_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul32.c
index 580cfb2..d6e298b 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__sse41_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-ssse3-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-ssse3-mul16.c
index 3d148fe..3dc45b4 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-ssse3-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__ssse3_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-wasmsimd-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-wasmsimd-mul16.c
index eb7b3cf..ea1b223 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-wasmsimd-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-wasmsimd-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__wasmsimd_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-xop-mul32.c
index d1590b2..a91d509 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-xop-mul32.c
@@ -20,7 +20,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__xop_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up1x25-minmax-gemmlowp-scalar.c b/src/qs8-dwconv/gen/up1x25-minmax-gemmlowp-scalar.c
index 05aaa74..b831f10 100644
--- a/src/qs8-dwconv/gen/up1x25-minmax-gemmlowp-scalar.c
+++ b/src/qs8-dwconv/gen/up1x25-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/scalar-utils.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up1x25__scalar(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up1x25__scalar(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up1x9-minmax-gemmlowp-scalar.c b/src/qs8-dwconv/gen/up1x9-minmax-gemmlowp-scalar.c
index 29523f7..ec4ab28 100644
--- a/src/qs8-dwconv/gen/up1x9-minmax-gemmlowp-scalar.c
+++ b/src/qs8-dwconv/gen/up1x9-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/scalar-utils.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up1x9__scalar(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up1x9__scalar(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul16.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul16.c
index 5040e72..bf88d5e 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__avx_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c
index f32c5d5..d113713 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__avx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx2-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx2-mul32.c
index 30157d0..8961bd8 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx2-mul32.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__avx2_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
@@ -501,7 +501,7 @@
         _mm256_add_epi32(_mm256_and_si256(vq31prodGHIJKLMN, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prodGHIJKLMN));
 
       const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-neon-mul16.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-neon-mul16.c
index ac87af0..53d2bf9 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-neon-mul16.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-neon-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x25__neon_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__neon_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse2-mul16.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse2-mul16.c
index 8ba83d2..7023fc4 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse2-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x25__sse2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__sse2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul16.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul16.c
index d0069bb..340f5f3 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__sse41_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul32.c
index 3309700..04ebdc9 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-sse41-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__sse41_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-ssse3-mul16.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-ssse3-mul16.c
index 499ab88..30e1f03 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-ssse3-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x25__ssse3_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__ssse3_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-wasmsimd-mul16.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-wasmsimd-mul16.c
index e0c71db..64a9281 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-wasmsimd-mul16.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-wasmsimd-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x25__wasmsimd_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__wasmsimd_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c
index fae92d2..f4ecccb 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c
@@ -20,7 +20,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__xop_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul16.c
index 1582948..28eb3a3 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__avx_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul32.c
index 32bb9dd..407af20 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__avx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx2-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx2-mul32.c
index 26221c8..b652b3d 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx2-mul32.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__avx2_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
@@ -229,7 +229,7 @@
         _mm256_add_epi32(_mm256_and_si256(vq31prodGHIJKLMN, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prodGHIJKLMN));
 
       const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-neon-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-neon-mul16.c
index 0de9001..d1844da 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-neon-mul16.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-neon-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__neon_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse2-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse2-mul16.c
index c2a24ed..95d69d2 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse2-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__sse2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul16.c
index b681da4..1905f55 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__sse41_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul32.c
index 65decb9..5efb25d 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse41-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__sse41_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-ssse3-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-ssse3-mul16.c
index 46cd248..7dbe948 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-ssse3-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x9__ssse3_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__ssse3_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-wasmsimd-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-wasmsimd-mul16.c
index f5efd68..edca769 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-wasmsimd-mul16.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-wasmsimd-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x9__wasmsimd_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__wasmsimd_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-xop-mul32.c
index 9aa0bda..9c04013 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-xop-mul32.c
@@ -20,7 +20,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__xop_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up2x25-minmax-gemmlowp-scalar.c b/src/qs8-dwconv/gen/up2x25-minmax-gemmlowp-scalar.c
index aa314b5..1bcc61d 100644
--- a/src/qs8-dwconv/gen/up2x25-minmax-gemmlowp-scalar.c
+++ b/src/qs8-dwconv/gen/up2x25-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/scalar-utils.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up2x25__scalar(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up2x25__scalar(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up2x9-minmax-gemmlowp-scalar.c b/src/qs8-dwconv/gen/up2x9-minmax-gemmlowp-scalar.c
index 3e35cf3..5c67cf4 100644
--- a/src/qs8-dwconv/gen/up2x9-minmax-gemmlowp-scalar.c
+++ b/src/qs8-dwconv/gen/up2x9-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/scalar-utils.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up2x9__scalar(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up2x9__scalar(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul16.c b/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul16.c
index e22d41f..7ca30b5 100644
--- a/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul16.c
+++ b/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x25__avx2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul32.c b/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul32.c
index 268ac2a..8973c9d 100644
--- a/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul32.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x25__avx2_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
@@ -585,7 +585,7 @@
         _mm256_add_epi32(_mm256_and_si256(vq31prodOPQRSTUV, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prodOPQRSTUV));
 
       const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
diff --git a/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx512skx-mul32.c b/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx512skx-mul32.c
index 20d7dd2..72cf2f0 100644
--- a/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx512skx-mul32.c
+++ b/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx512skx-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up32x25__avx512skx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x25__avx512skx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-neon-mul16.c b/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-neon-mul16.c
index 558a399..b5cf1f2 100644
--- a/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-neon-mul16.c
+++ b/src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-neon-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up32x25__neon_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x25__neon_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul16.c b/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul16.c
index a7be837..bf634ea 100644
--- a/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul16.c
+++ b/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x9__avx2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul32.c b/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul32.c
index 1e2a93e..3eb639a 100644
--- a/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul32.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x9__avx2_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
@@ -265,7 +265,7 @@
         _mm256_add_epi32(_mm256_and_si256(vq31prodOPQRSTUV, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prodOPQRSTUV));
 
       const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
diff --git a/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx512skx-mul32.c b/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx512skx-mul32.c
index d9e52a7..b321986 100644
--- a/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx512skx-mul32.c
+++ b/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx512skx-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x9__avx512skx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-neon-mul16.c b/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-neon-mul16.c
index e0e6458..5477ee3 100644
--- a/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-neon-mul16.c
+++ b/src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-neon-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x9__neon_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up4x25-minmax-gemmlowp-scalar.c b/src/qs8-dwconv/gen/up4x25-minmax-gemmlowp-scalar.c
index 33992d5..56f74fb 100644
--- a/src/qs8-dwconv/gen/up4x25-minmax-gemmlowp-scalar.c
+++ b/src/qs8-dwconv/gen/up4x25-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/scalar-utils.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up4x25__scalar(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up4x25__scalar(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up4x9-minmax-gemmlowp-scalar.c b/src/qs8-dwconv/gen/up4x9-minmax-gemmlowp-scalar.c
index 6cd245b..8980ff2 100644
--- a/src/qs8-dwconv/gen/up4x9-minmax-gemmlowp-scalar.c
+++ b/src/qs8-dwconv/gen/up4x9-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/scalar-utils.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up4x9__scalar(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up4x9__scalar(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul16.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul16.c
index b23e417..232974b 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__avx_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul32.c
index 889b7a8..491e608 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__avx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx2-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx2-mul32.c
index a298d77..29996a5 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx2-mul32.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__avx2_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-neon-mul16.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-neon-mul16.c
index 71b69f2..21dd238 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-neon-mul16.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-neon-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x25__neon_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__neon_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse2-mul16.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse2-mul16.c
index 498844c..161dcd7 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse2-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x25__sse2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__sse2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul16.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul16.c
index 45dfe82..53f696c 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__sse41_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul32.c
index 3ccbf5a..049dc5b 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__sse41_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-ssse3-mul16.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-ssse3-mul16.c
index 5704b3b..432bfa2 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-ssse3-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x25__ssse3_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__ssse3_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-wasmsimd-mul16.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-wasmsimd-mul16.c
index 38345b4..9ed4d64 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-wasmsimd-mul16.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-wasmsimd-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x25__wasmsimd_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__wasmsimd_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-xop-mul32.c
index a4467b8..34957ae 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-xop-mul32.c
@@ -20,7 +20,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__xop_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul16.c
index 7f4f9e7..ee1e262 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__avx_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul32.c
index 20ae162..2e938df 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__avx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx2-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx2-mul32.c
index fdebd15..3f11447 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx2-mul32.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__avx2_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-neon-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-neon-mul16.c
index 1b13e62..51492bd 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-neon-mul16.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-neon-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__neon_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse2-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse2-mul16.c
index d9bb1c1..78ed3c8 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse2-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x9__sse2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__sse2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul16.c
index 0a422aa..4cad6ea 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__sse41_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul32.c
index 4cb309e..2c1059c 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul32.c
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__sse41_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-ssse3-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-ssse3-mul16.c
index b1f1651..23b45c8 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-ssse3-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x9__ssse3_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__ssse3_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-wasmsimd-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-wasmsimd-mul16.c
index cd229d5..3832f21 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-wasmsimd-mul16.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-wasmsimd-mul16.c
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__wasmsimd_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-xop-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-xop-mul32.c
index f3d8308..63b32dd 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-xop-mul32.c
@@ -20,7 +20,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__xop_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/unipass-avx2-mul16.c.in b/src/qs8-dwconv/unipass-avx2-mul16.c.in
index f61650f..4bcbe25 100644
--- a/src/qs8-dwconv/unipass-avx2-mul16.c.in
+++ b/src/qs8-dwconv/unipass-avx2-mul16.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__avx2_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__avx2_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/unipass-avx2-mul32.c.in b/src/qs8-dwconv/unipass-avx2-mul32.c.in
index 8d808fc..072b747 100644
--- a/src/qs8-dwconv/unipass-avx2-mul32.c.in
+++ b/src/qs8-dwconv/unipass-avx2-mul32.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__avx2_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__avx2_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/unipass-avx512skx-mul32.c.in b/src/qs8-dwconv/unipass-avx512skx-mul32.c.in
index 8889966..903961e 100644
--- a/src/qs8-dwconv/unipass-avx512skx-mul32.c.in
+++ b/src/qs8-dwconv/unipass-avx512skx-mul32.c.in
@@ -15,7 +15,7 @@
 #include <xnnpack/intrinsics-polyfill.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__avx512skx_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__avx512skx_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/unipass-neon-mul16.c.in b/src/qs8-dwconv/unipass-neon-mul16.c.in
index b9d8862..54714f9 100644
--- a/src/qs8-dwconv/unipass-neon-mul16.c.in
+++ b/src/qs8-dwconv/unipass-neon-mul16.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__neon_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__neon_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/unipass-scalar.c.in b/src/qs8-dwconv/unipass-scalar.c.in
index 7cd25c6..cddbbc1 100644
--- a/src/qs8-dwconv/unipass-scalar.c.in
+++ b/src/qs8-dwconv/unipass-scalar.c.in
@@ -11,7 +11,7 @@
 #include <xnnpack/scalar-utils.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__scalar(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__scalar(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/unipass-sse-mul16.c.in b/src/qs8-dwconv/unipass-sse-mul16.c.in
index 5036dd7..a513992 100644
--- a/src/qs8-dwconv/unipass-sse-mul16.c.in
+++ b/src/qs8-dwconv/unipass-sse-mul16.c.in
@@ -17,7 +17,7 @@
 
 $PARAMS_STRUCT = "sse4" if SSE >= 4 else "sse2"
 $ISA = "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
-void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${ISA}_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${ISA}_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/unipass-sse-mul32.c.in b/src/qs8-dwconv/unipass-sse-mul32.c.in
index 739501c..6e89d94 100644
--- a/src/qs8-dwconv/unipass-sse-mul32.c.in
+++ b/src/qs8-dwconv/unipass-sse-mul32.c.in
@@ -26,7 +26,7 @@
 
 
 $ISA = "xop" if XOP else "avx" if AVX else {4: "sse41"}[SSE]
-void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${ISA}_mul32(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${ISA}_mul32(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-dwconv/unipass-wasmsimd-mul16.c.in b/src/qs8-dwconv/unipass-wasmsimd-mul16.c.in
index 7a74d3b..5031fd6 100644
--- a/src/qs8-dwconv/unipass-wasmsimd-mul16.c.in
+++ b/src/qs8-dwconv/unipass-wasmsimd-mul16.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__wasmsimd_mul16(
+void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__wasmsimd_mul16(
     size_t channels,
     size_t output_width,
     const int8_t** input,
diff --git a/src/qs8-gemm/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S b/src/qs8-gemm/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
index 69669ef..7146da6 100644
--- a/src/qs8-gemm/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
+++ b/src/qs8-gemm/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -25,7 +25,7 @@
 # C0  x6 v28 v29 v30 v31
 # unused v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32
 0:
         # Load initial bias from w into accumulators
         ADD     x2, x2, 3               // kc = (kc + 3) & ~3
@@ -111,7 +111,7 @@
 6:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S b/src/qs8-gemm/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
index 7c8b25b..73113db 100644
--- a/src/qs8-gemm/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+++ b/src/qs8-gemm/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -25,7 +25,7 @@
 # C0  x6 v28 v29 v30 v31
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64
         ADD     x2, x2, 3               // kc = (kc + 3) & ~3
         BIC     x2, x2, 3
 
@@ -145,7 +145,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
index 30baf88..2e8ce51 100644
--- a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
+++ b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -27,7 +27,7 @@
 # x16, x17, x7 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
 
         LDP     x10, x9, [sp]           // cn_stride, params
 
@@ -272,7 +272,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in
index df90eff..99f6cb2 100644
--- a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -26,7 +26,7 @@
 # temp0  v17 v19 v21 v23
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
 
         LDP     x10, x9, [sp]           // cn_stride, params
 
@@ -247,7 +247,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-gemm/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index 40fe81b..9f8c28b 100644
--- a/src/qs8-gemm/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -29,7 +29,7 @@
 # temp1   v3 v11 v13 v15
 # unused  v8 v9
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -210,7 +210,7 @@
         LDP     d10, d11, [sp], 48
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
index 9f47636..f598124 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -30,7 +30,7 @@
 # x16, x17, x20, x21 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -406,7 +406,7 @@
         LDP     d8, d9, [sp], 80
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in
index 408db1a..398aee8 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -29,7 +29,7 @@
 # temp1   v3 v11 v13 v15
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -353,7 +353,7 @@
         LDP     d8, d9, [sp], 64
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S b/src/qs8-gemm/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
index f4e75da..a5a1444 100644
--- a/src/qs8-gemm/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
+++ b/src/qs8-gemm/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -29,7 +29,7 @@
 # temp1   v3 v11 v13 v15
 # unused  v8 v9
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -193,7 +193,7 @@
         LDP     d10, d11, [sp], 48
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in b/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
index ce92cbf..cb0facf 100644
--- a/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
+++ b/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -34,7 +34,7 @@
 
 # x10 x17 a53 temp registers
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -816,7 +816,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S b/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
index 5b3e213..811cd5c 100644
--- a/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -31,7 +31,7 @@
 # C3  x7 v19 v23 v27 v31
 # unused v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -658,7 +658,7 @@
         LDP     d8,  d9, [sp], 32
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S b/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
index e92e6fe..169d7eb 100644
--- a/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
+++ b/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -31,7 +31,7 @@
 # C3  x7 v19 v23 v27 v31
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -289,7 +289,7 @@
 6:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S b/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
index bf65995..efb6234 100644
--- a/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+++ b/src/qs8-gemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -31,7 +31,7 @@
 # C3  x7 v19 v23 v27 v31
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -340,7 +340,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/MRx16c8-avx512skx.c.in b/src/qs8-gemm/MRx16c8-avx512skx.c.in
index 7870b6c..12e073f 100644
--- a/src/qs8-gemm/MRx16c8-avx512skx.c.in
+++ b/src/qs8-gemm/MRx16c8-avx512skx.c.in
@@ -16,7 +16,7 @@
 
 
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
-void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x16c8__avx512skx(
+void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_gemmlowp_ukernel_${MR}x16c8__avx512skx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/MRx4c2-sse.c.in b/src/qs8-gemm/MRx4c2-sse.c.in
index 7669bfa..e13c6f2 100644
--- a/src/qs8-gemm/MRx4c2-sse.c.in
+++ b/src/qs8-gemm/MRx4c2-sse.c.in
@@ -29,7 +29,7 @@
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
 $PARAMS_STRUCT = "sse4" if SSE >= 4 else "sse2"
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
-void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x4c2__${ISA}${LOAD_SUFFIX}(
+void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_gemmlowp_ukernel_${MR}x4c2__${ISA}${LOAD_SUFFIX}(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/MRx4c8-sse.c.in b/src/qs8-gemm/MRx4c8-sse.c.in
index 0f2ed51..f06aed9 100644
--- a/src/qs8-gemm/MRx4c8-sse.c.in
+++ b/src/qs8-gemm/MRx4c8-sse.c.in
@@ -29,7 +29,7 @@
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
 $PARAMS_STRUCT = "sse4" if SSE >= 4 else "sse2"
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
-void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x4c8__${ISA}${LOAD_SUFFIX}(
+void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_gemmlowp_ukernel_${MR}x4c8__${ISA}${LOAD_SUFFIX}(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/MRx4c8-wasmsimd.c.in b/src/qs8-gemm/MRx4c8-wasmsimd.c.in
index 4fced4a..46c1b72 100644
--- a/src/qs8-gemm/MRx4c8-wasmsimd.c.in
+++ b/src/qs8-gemm/MRx4c8-wasmsimd.c.in
@@ -15,7 +15,7 @@
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
-void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x4c8__wasmsimd${LOAD_SUFFIX}(
+void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_gemmlowp_ukernel_${MR}x4c8__wasmsimd${LOAD_SUFFIX}(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/MRx8c8-avx2.c.in b/src/qs8-gemm/MRx8c8-avx2.c.in
index bac1ce2..20e4af5 100644
--- a/src/qs8-gemm/MRx8c8-avx2.c.in
+++ b/src/qs8-gemm/MRx8c8-avx2.c.in
@@ -15,7 +15,7 @@
 
 
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
-void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x8c8__avx2(
+void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_gemmlowp_ukernel_${MR}x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/MRxNRc4-neondot.c.in b/src/qs8-gemm/MRxNRc4-neondot.c.in
index b037bd8..dded8ae 100644
--- a/src/qs8-gemm/MRxNRc4-neondot.c.in
+++ b/src/qs8-gemm/MRxNRc4-neondot.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c4__neondot(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_${MR}x${NR}c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/c16-neon-mlal-padal.c.in b/src/qs8-gemm/c16-neon-mlal-padal.c.in
index 15f28b7..cb7ae46 100644
--- a/src/qs8-gemm/c16-neon-mlal-padal.c.in
+++ b/src/qs8-gemm/c16-neon-mlal-padal.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c16__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_${MR}x${NR}c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/c2-neon-mull-padal-dup.c.in b/src/qs8-gemm/c2-neon-mull-padal-dup.c.in
index f9281fa..235d9e0 100644
--- a/src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+++ b/src/qs8-gemm/c2-neon-mull-padal-dup.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c2__neon_${"mlal" if MLA else "mull"}_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_${MR}x${NR}c2__neon_${"mlal" if MLA else "mull"}_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/c8-neon-mull-padal.c.in b/src/qs8-gemm/c8-neon-mull-padal.c.in
index 3a39bdf..3895a74 100644
--- a/src/qs8-gemm/c8-neon-mull-padal.c.in
+++ b/src/qs8-gemm/c8-neon-mull-padal.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c8__neon_${"mlal" if MLA else "mull"}_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_${MR}x${NR}c8__neon_${"mlal" if MLA else "mull"}_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 884d3d9..cd26fa5 100644
--- a/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane_prfm(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c
index 72f6a33..bb363e9 100644
--- a/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mull-addw-dup.c
index 0d1e159..319a9d0 100644
--- a/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x16c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/1x16c16-minmax-gemmlowp-neon-mlal-padal.c
index 8777d3d..1735cc8 100644
--- a/src/qs8-gemm/gen/1x16c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/1x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/1x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 7a60c35..262b710 100644
--- a/src/qs8-gemm/gen/1x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/1x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x16c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-gemm/gen/1x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
index 6ab52c5..407cc9b 100644
--- a/src/qs8-gemm/gen/1x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/1x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-neondot.c b/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-neondot.c
index f73dd33..9439336 100644
--- a/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-avx512skx.c b/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-avx512skx.c
index 7729f91..08fff83 100644
--- a/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-avx512skx.c
+++ b/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-avx512skx.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__avx512skx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
index a45769e..f2610c8 100644
--- a/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
index ac60087..540f4ee 100644
--- a/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x2-minmax-gemmlowp-scalar.c b/src/qs8-gemm/gen/1x2-minmax-gemmlowp-scalar.c
index e206a4b..4454ec1 100644
--- a/src/qs8-gemm/gen/1x2-minmax-gemmlowp-scalar.c
+++ b/src/qs8-gemm/gen/1x2-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x2__scalar(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x2__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4-minmax-gemmlowp-scalar.c b/src/qs8-gemm/gen/1x4-minmax-gemmlowp-scalar.c
index 731650e..58ce276 100644
--- a/src/qs8-gemm/gen/1x4-minmax-gemmlowp-scalar.c
+++ b/src/qs8-gemm/gen/1x4-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4__scalar(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
index 8350dc1..2601b09 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
index 63362ee..d3e3979 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
index 7df288a..d7f76e3 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
index d809fb1..d4703ba 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
index 6b57e90..19ea8d5 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
index 5a75904..e8112b9 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
index b3ea64b..cdcd535 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
index 1401dc4..e24e6cc 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
index 227d0ab..b92097b 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
index 1a3f597..1d4e12c 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
index 4c04278..e3e799e 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
index 299d609..1954255 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
index 975d67c..e3fff33 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
index 1b56035..ab14a17 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
index 5565473..9e37aeb 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
index 9f3ee44..7154798 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
index d537eeb..884a744 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
index 6370331..0a3c9bf 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
index 89adced..d824489 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
index a68701d..eec86d2 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
index 17a890e..e8f39f1 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
index 33104b1..ab3766d 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
index 535b406..b2f0dd1 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c
index 358cb92..f5b73f0 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__wasmsimd_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld64.c
index 7cd2050..96d437c 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__wasmsimd_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
index 34e3cb0..a8f1eb7 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
index ebcc4aa..0a3b7b1 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
index 6d4827c..608678e 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
index c5da5d5..8304817 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
index c0ee9fd..cbf6c84 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
index 9967e0f..a1a6121 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-wasmsimd.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-wasmsimd.c
index 6983219..2984f88 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-wasmsimd.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-wasmsimd.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__wasmsimd(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__wasmsimd(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
index 44bc8cc..afa66cd 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 4a0861d..9279a2f 100644
--- a/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane_prfm(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane.c
index a00f139..0039d94 100644
--- a/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mull-addw-dup.c
index 2e98ffc..61a756b 100644
--- a/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-gemm/gen/1x8-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c
index 41d2ce5..206e8a0 100644
--- a/src/qs8-gemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 02257a1..b830974 100644
--- a/src/qs8-gemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-gemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
index d5f3247..08c447a 100644
--- a/src/qs8-gemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8c4-minmax-gemmlowp-neondot.c b/src/qs8-gemm/gen/1x8c4-minmax-gemmlowp-neondot.c
index 7c6318d..88822ea 100644
--- a/src/qs8-gemm/gen/1x8c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-gemm/gen/1x8c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
index a7f167c..453eb25 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -31,7 +31,7 @@
 # x16, x17, x7 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
 
         LDP     x10, x9, [sp]           // cn_stride, params
 
@@ -270,7 +270,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
index 3491d88..587c39c 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -31,7 +31,7 @@
 # x16, x17, x7 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
 
         LDP     x10, x9, [sp]           // cn_stride, params
 
@@ -273,7 +273,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
index 120e13f..c876f1b 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -30,7 +30,7 @@
 # temp0  v17 v19 v21 v23
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
 
         LDP     x10, x9, [sp]           // cn_stride, params
 
@@ -245,7 +245,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index 5084b31..d835538 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -30,7 +30,7 @@
 # temp0  v17 v19 v21 v23
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal
 
         LDP     x10, x9, [sp]           // cn_stride, params
 
@@ -239,7 +239,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c
index 5f0a77c..44c0264 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
index ed094a7..b086df7 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
index 971d6a4..c97a2dc 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
index 69b6b7c..9146b41 100644
--- a/src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
+++ b/src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 49580f8..9128db9 100644
--- a/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane_prfm(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c
index b7d4071..d883ca9 100644
--- a/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mull-addw-dup.c
index 5850903..cab6ad6 100644
--- a/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x16c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/2x16c16-minmax-gemmlowp-neon-mlal-padal.c
index eafacd8..6ddcd89 100644
--- a/src/qs8-gemm/gen/2x16c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/2x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/2x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 20959ce..3b33cc3 100644
--- a/src/qs8-gemm/gen/2x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/2x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x16c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-gemm/gen/2x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
index a306ef6..8d2862d 100644
--- a/src/qs8-gemm/gen/2x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/2x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-avx512skx.c b/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-avx512skx.c
index cab5fd0..c437aea 100644
--- a/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-avx512skx.c
+++ b/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-avx512skx.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__avx512skx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
index de9c3ea..a766207 100644
--- a/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
index 84a40e3..c96eaec 100644
--- a/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x2-minmax-gemmlowp-scalar.c b/src/qs8-gemm/gen/2x2-minmax-gemmlowp-scalar.c
index 523cf90..9f4d9b4 100644
--- a/src/qs8-gemm/gen/2x2-minmax-gemmlowp-scalar.c
+++ b/src/qs8-gemm/gen/2x2-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x2__scalar(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x2__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4-minmax-gemmlowp-scalar.c b/src/qs8-gemm/gen/2x4-minmax-gemmlowp-scalar.c
index 48188eb..01f2ab8 100644
--- a/src/qs8-gemm/gen/2x4-minmax-gemmlowp-scalar.c
+++ b/src/qs8-gemm/gen/2x4-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4__scalar(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
index a6bb8cf..e9b0921 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
index 2b72822..d9fde6f 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
index c4ade3b..21bac85 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c2__sse2_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
index 672ce4a..f0f634f 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c2__sse2_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
index 60a7aa1..5b3d4ba 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
index 7f5ba39..07bedc7 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
index 53bf77c..be538ab 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c2__ssse3_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
index c2aec7c..49afa05 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c2__ssse3_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
index 6266b33..6ff8eaf 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
index a4f12a4..864db27 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
index dad6b55..5b70a39 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
index 4beb018..3391f82 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__sse2(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
index a1575e9..2a7e69e 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
index d51e490..4edb2ab 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__ssse3(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__ssse3(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
index 19dcda2..f202ba3 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
index fc48e65..ebe7be8 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
index 713a705..1224615 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
index 978e912..41ae006 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
index 8d7725e..eed4666 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
index 6236921..1efa46e 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
index d9db58d..67248e0 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
index 26c49c9..4edb5ad 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
index 163a9f8..40cc948 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c
index a48f0c7..e3dee05 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__wasmsimd_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld64.c
index 2c37105..3996eaa 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__wasmsimd_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
index 8f2ec5e..160f770 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
index 16bbdcc..f240a7e 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
index 3a1a964..db3d8c3 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
index 7764dd8..0f2e070 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse2(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
index 1903f6b..ff3ec8f 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
index bde1b65..771050c 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__ssse3(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c
index e9444f8..4b866f1 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__wasmsimd(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
index b15a8f9..60defec 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 35fa5c9..3e70e46 100644
--- a/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane_prfm(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c
index 296571f..c24f9af 100644
--- a/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mull-addw-dup.c
index b5a7b2f..2938a65 100644
--- a/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-gemm/gen/2x8-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c
index 21b3acc..34a5978 100644
--- a/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 530d297..3d4e374 100644
--- a/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
index 83ab0d1..3ba6953 100644
--- a/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
index a9e2aa4..3dba71d 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -34,7 +34,7 @@
 # x16, x17, x20, x21 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -402,7 +402,7 @@
         LDP     d8, d9, [sp], 80
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
index 1470573..caa2e3b 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -34,7 +34,7 @@
 # x16, x17, x20, x21 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -406,7 +406,7 @@
         LDP     d8, d9, [sp], 80
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
index 661ae54..e5be317 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -33,7 +33,7 @@
 # temp1   v3 v11 v13 v15
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -353,7 +353,7 @@
         LDP     d8, d9, [sp], 64
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index f6df72a..6b0e996 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -33,7 +33,7 @@
 # temp1   v3 v11 v13 v15
 
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -349,7 +349,7 @@
         LDP     d8, d9, [sp], 64
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-avx2.c
index be11da6..be9fc9a 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-avx2.c
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-avx2.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
index 19f7b55..9660125 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
index 41b37d7..d840aba 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
index f4e707b..732835b 100644
--- a/src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
+++ b/src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 0fb5e2e..b16dcd3 100644
--- a/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane_prfm(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane.c
index 9c498fc..4e234e0 100644
--- a/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mull-addw-dup.c
index 84b77b5..e1b0ad0 100644
--- a/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-gemm/gen/3x16-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x16c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/3x16c16-minmax-gemmlowp-neon-mlal-padal.c
index 5bd1d85..2a9e2cf 100644
--- a/src/qs8-gemm/gen/3x16c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/3x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/3x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 782bb6b..259cc2e 100644
--- a/src/qs8-gemm/gen/3x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/3x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x16c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-gemm/gen/3x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
index f5b5951..9912589 100644
--- a/src/qs8-gemm/gen/3x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/3x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-avx512skx.c b/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-avx512skx.c
index b6d6a6d..40903b2 100644
--- a/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-avx512skx.c
+++ b/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-avx512skx.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__avx512skx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
index 9b457f2..03f31f0 100644
--- a/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
index 77450dd..93aae3c 100644
--- a/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x2-minmax-gemmlowp-scalar.c b/src/qs8-gemm/gen/3x2-minmax-gemmlowp-scalar.c
index 04253c0..461568a 100644
--- a/src/qs8-gemm/gen/3x2-minmax-gemmlowp-scalar.c
+++ b/src/qs8-gemm/gen/3x2-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x2__scalar(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x2__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4-minmax-gemmlowp-scalar.c b/src/qs8-gemm/gen/3x4-minmax-gemmlowp-scalar.c
index 30a6d51..09eb67a 100644
--- a/src/qs8-gemm/gen/3x4-minmax-gemmlowp-scalar.c
+++ b/src/qs8-gemm/gen/3x4-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4__scalar(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
index 06a9192..2010ae1 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
index cff57f6..31002b2 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
index a3e6c9f..f6b4b10 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c2__sse2_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
index a6a69b4..629aa5f 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c2__sse2_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
index 396470b..bd727a6 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
index 227d609..b379ade 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
index 306b6e5..762c63d 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c2__ssse3_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
index 664fd04..09db9ec 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c2__ssse3_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
index f599282..17c0ea1 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
index c7a4ceb..b05257b 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
index 0f5d9b1..29abd40 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__avx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
index 0b0c916..a54a94f 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__sse2(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
index c4b2953..b6f21c4 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
index 4cd3e0d..e44996f 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__ssse3(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__ssse3(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
index 86d4b3f..6687fef 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__xop(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
index a2bf9d9..2b106c2 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
index 72b3074..96980f4 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
index 0f57e0f..2da3d50 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
index 10fa7cd..55cc4a7 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
index 6d8579c..4a255bc 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
index 62b0ba2..0ddf914 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
index 81d0de0..9a8dd50 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
index fa0121b..0857eaf 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c
index 2c11faf..35d7f78 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__wasmsimd_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld64.c
index 678e305..26f58fc 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__wasmsimd_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
index 4a09afe..d840ac7 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
index 0c874fc..1bb8e6f 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
index a4192f6..5168a36 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
index 4d4d116..98bfb01 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse2(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
index a00a4ca..b9eda39 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
index 13681e3..c402de7 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__ssse3(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-wasmsimd.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-wasmsimd.c
index d0df439..5fe2220 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-wasmsimd.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-wasmsimd.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__wasmsimd(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
index e471b8e..2bf623f 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 94ddff9..ca03c7b 100644
--- a/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane_prfm(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane.c
index 6e4a6b4..051e8f2 100644
--- a/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mull-addw-dup.c
index b66556d..49a11d6 100644
--- a/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-gemm/gen/3x8-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x8c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/3x8c16-minmax-gemmlowp-neon-mlal-padal.c
index e4ec320..6053eac 100644
--- a/src/qs8-gemm/gen/3x8c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/3x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/3x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index f273682..851629c 100644
--- a/src/qs8-gemm/gen/3x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/3x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x8c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-gemm/gen/3x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
index c2510de..8001d85 100644
--- a/src/qs8-gemm/gen/3x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/3x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-avx2.c
index d41e047..e87cbef 100644
--- a/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-avx2.c
+++ b/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-avx2.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
index f7cdbe7..faa1eb7 100644
--- a/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
index 0b7651e..18d17a1 100644
--- a/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
index d589e3c..f7d5a09 100644
--- a/src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
+++ b/src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S b/src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
index 81c990c..0e84934 100644
--- a/src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
+++ b/src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -38,7 +38,7 @@
 
 # x10 x17 a53 temp registers
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -808,7 +808,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S b/src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
index 4940ffe..4250a81 100644
--- a/src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
+++ b/src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
+# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -38,7 +38,7 @@
 
 # x10 x17 a53 temp registers
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -814,7 +814,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
+END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
index c84ae2d..89c6da0 100644
--- a/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane_prfm(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c
index 4dfd5af..6311f6b 100644
--- a/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mull-addw-dup.c
index cd1d411..546e6c0 100644
--- a/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-gemm/gen/4x16-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x16c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/4x16c16-minmax-gemmlowp-neon-mlal-padal.c
index 9831c12..88a8672 100644
--- a/src/qs8-gemm/gen/4x16c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/4x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 1a21b08..045bcc0 100644
--- a/src/qs8-gemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x16c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-gemm/gen/4x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
index 8f6c1a2..1e2eb9a 100644
--- a/src/qs8-gemm/gen/4x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/4x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-neondot.c b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-neondot.c
index 2744fe9..d62c105 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-avx512skx.c b/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-avx512skx.c
index 2a7b6ca..d3cea1a 100644
--- a/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-avx512skx.c
+++ b/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-avx512skx.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__avx512skx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
index 2f5cbb0..1768564 100644
--- a/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
index 12294e8..f64cfb1 100644
--- a/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x2-minmax-gemmlowp-scalar.c b/src/qs8-gemm/gen/4x2-minmax-gemmlowp-scalar.c
index 5d9c3ae..e2050cd 100644
--- a/src/qs8-gemm/gen/4x2-minmax-gemmlowp-scalar.c
+++ b/src/qs8-gemm/gen/4x2-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x2__scalar(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x2__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4-minmax-gemmlowp-scalar.c b/src/qs8-gemm/gen/4x4-minmax-gemmlowp-scalar.c
index 725583d..35cf43c 100644
--- a/src/qs8-gemm/gen/4x4-minmax-gemmlowp-scalar.c
+++ b/src/qs8-gemm/gen/4x4-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4__scalar(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
index d3c0a28..0f19279 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
index 43fdc24..83572e9 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
index 20a69e6..daff859 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
index aed4404..5ab288b 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
index 31a3cda..180fabe 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
index b2cf9d6..1bb000d 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
index c1a756e..94f18c0 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
index d1b058d..757c664 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
index b1b2b86..885d0f6 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
index 1bf5524..ff09bbe 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
index 8179aeb..0ff4f2d 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
index 60593d2..6fad423 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
index ca5a399..b59daa4 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
index a2cd7d0..a56c299 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
index 699237d..99ea366 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 2a00c63..427252d 100644
--- a/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane_prfm(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane.c
index 6742b5c..04c1e2b 100644
--- a/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mull-addw-dup.c
index 23b5f6f..abb9621 100644
--- a/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-gemm/gen/4x8-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c
index 8fc445d..5245b23 100644
--- a/src/qs8-gemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/4x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index fa2daf0..d9f855b 100644
--- a/src/qs8-gemm/gen/4x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/4x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x8c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-gemm/gen/4x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
index 879fdc7..8a81abc 100644
--- a/src/qs8-gemm/gen/4x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/4x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x8c4-minmax-gemmlowp-neondot.c b/src/qs8-gemm/gen/4x8c4-minmax-gemmlowp-neondot.c
index b609be0..b31b68e 100644
--- a/src/qs8-gemm/gen/4x8c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-gemm/gen/4x8c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-gemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
index 96764f3..bd2e83e 100644
--- a/src/qs8-gemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-gemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
index 520ab4a..e37c3a1 100644
--- a/src/qs8-gemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-gemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
index f8d3267..8218751 100644
--- a/src/qs8-gemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-gemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_6x16__neon_mlal_lane_prfm(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x16__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-gemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane.c
index 00e6d40..db38867 100644
--- a/src/qs8-gemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-gemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_6x16__neon_mlal_lane(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x16__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/6x16c4-minmax-gemmlowp-neondot.c b/src/qs8-gemm/gen/6x16c4-minmax-gemmlowp-neondot.c
index 1bd39bc..5dddcb2 100644
--- a/src/qs8-gemm/gen/6x16c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-gemm/gen/6x16c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x16c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-gemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 8a4308a..405d6d0 100644
--- a/src/qs8-gemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-gemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_6x8__neon_mlal_lane_prfm(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x8__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-gemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane.c
index 63d1f02..577417f 100644
--- a/src/qs8-gemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-gemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_6x8__neon_mlal_lane(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x8__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/6x8c4-minmax-gemmlowp-neondot.c b/src/qs8-gemm/gen/6x8c4-minmax-gemmlowp-neondot.c
index 489e824..7466ac0 100644
--- a/src/qs8-gemm/gen/6x8c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-gemm/gen/6x8c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x8c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/8x16c4-minmax-gemmlowp-neondot.c b/src/qs8-gemm/gen/8x16c4-minmax-gemmlowp-neondot.c
index 88d722e..cd39d25 100644
--- a/src/qs8-gemm/gen/8x16c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-gemm/gen/8x16c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_8x16c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/gen/8x8c4-minmax-gemmlowp-neondot.c b/src/qs8-gemm/gen/8x8c4-minmax-gemmlowp-neondot.c
index 47375be..fdbc6e3 100644
--- a/src/qs8-gemm/gen/8x8c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-gemm/gen/8x8c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_8x8c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/neon-mlal-lane.c.in b/src/qs8-gemm/neon-mlal-lane.c.in
index 0d9bf20..04e952c 100644
--- a/src/qs8-gemm/neon-mlal-lane.c.in
+++ b/src/qs8-gemm/neon-mlal-lane.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}__neon_mlal_lane${"_prfm" if PREFETCH else ""}(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_${MR}x${NR}__neon_mlal_lane${"_prfm" if PREFETCH else ""}(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/neon-mull-addw-dup.c.in b/src/qs8-gemm/neon-mull-addw-dup.c.in
index 3d38ef2..0508020 100644
--- a/src/qs8-gemm/neon-mull-addw-dup.c.in
+++ b/src/qs8-gemm/neon-mull-addw-dup.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}__neon_mull_addw_dup(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_${MR}x${NR}__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-gemm/scalar.c.in b/src/qs8-gemm/scalar.c.in
index 314ff76..e94033f 100644
--- a/src/qs8-gemm/scalar.c.in
+++ b/src/qs8-gemm/scalar.c.in
@@ -10,7 +10,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}__scalar(
+void xnn_qs8_gemm_minmax_gemmlowp_ukernel_${MR}x${NR}__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in b/src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
index 98df387..9d96ed8 100644
--- a/src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
+++ b/src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -29,7 +29,7 @@
 # x16, x17, x7 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -292,7 +292,7 @@
 9:
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal.S.in b/src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal.S.in
index a61da97..c105196 100644
--- a/src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal.S.in
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -28,7 +28,7 @@
 # temp0  v17 v19 v21 v23
 
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -268,7 +268,7 @@
 9:
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-igemm/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index bc28ebe..f1a64a5 100644
--- a/src/qs8-igemm/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-igemm/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -31,7 +31,7 @@
 # temp1   v3 v11 v13 v15
 # unused  v8 v9
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -231,7 +231,7 @@
         LDP     d10, d11, [sp], 48
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in b/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
index 862d901..9a0b8db 100644
--- a/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
+++ b/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -32,7 +32,7 @@
 # x16, x17, x20, x21 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -430,7 +430,7 @@
         LDP     d8, d9, [sp], 80
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S.in b/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S.in
index dc9c076..4d59138 100644
--- a/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S.in
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -30,7 +30,7 @@
 # temp0   v2 v10 v12 v14
 # temp1   v3 v11 v13 v15
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -376,7 +376,7 @@
         LDP     d8, d9, [sp], 64
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
index c4b183e..03b2a58 100644
--- a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
+++ b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -35,7 +35,7 @@
 # unused  v8 v9 v10 v11 v12 v13 v14 v15
 # x8, x21 temp for Cortex-A53 loads
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
@@ -844,7 +844,7 @@
         LDP     x20, x21, [sp], 16
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S b/src/qs8-igemm/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
index 0774357..4a01759 100644
--- a/src/qs8-igemm/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-igemm/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -35,7 +35,7 @@
 
 # x8 temp for Cortex-A55 loads
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
@@ -692,7 +692,7 @@
         LDR     x20, [sp], 48
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S b/src/qs8-igemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
index ca6f97b..38b6575 100644
--- a/src/qs8-igemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+++ b/src/qs8-igemm/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -5,7 +5,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -33,7 +33,7 @@
 # C3   x7 v19 v23 v27 v31
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
@@ -368,7 +368,7 @@
 9:
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/MRx16c8-avx512skx.c.in b/src/qs8-igemm/MRx16c8-avx512skx.c.in
index 4eddff2..44e15c5 100644
--- a/src/qs8-igemm/MRx16c8-avx512skx.c.in
+++ b/src/qs8-igemm/MRx16c8-avx512skx.c.in
@@ -16,7 +16,7 @@
 
 
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
-void xnn_qs8_igemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x16c8__avx512skx(
+void xnn_qs8_igemm${GEMM_SUFFIX}_minmax_gemmlowp_ukernel_${MR}x16c8__avx512skx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/MRx4c2-sse.c.in b/src/qs8-igemm/MRx4c2-sse.c.in
index ded0cde..fd1fa0d 100644
--- a/src/qs8-igemm/MRx4c2-sse.c.in
+++ b/src/qs8-igemm/MRx4c2-sse.c.in
@@ -27,7 +27,7 @@
 
 $PARAMS_STRUCT = "sse4" if SSE >= 4 else "sse2"
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
-void xnn_qs8_igemm_minmax_ukernel_${MR}x4c2__${ISA}_${VARIANT.lower()}(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x4c2__${ISA}_${VARIANT.lower()}(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/MRx4c8-sse.c.in b/src/qs8-igemm/MRx4c8-sse.c.in
index 2357c61..4a53a24 100644
--- a/src/qs8-igemm/MRx4c8-sse.c.in
+++ b/src/qs8-igemm/MRx4c8-sse.c.in
@@ -27,7 +27,7 @@
 
 $PARAMS_STRUCT = "sse4" if SSE >= 4 else "sse2"
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
-void xnn_qs8_igemm_minmax_ukernel_${MR}x4c8__${ISA}_${VARIANT.lower()}(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x4c8__${ISA}_${VARIANT.lower()}(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/MRx4c8-wasmsimd.c.in b/src/qs8-igemm/MRx4c8-wasmsimd.c.in
index 22aec9c..8703f0d 100644
--- a/src/qs8-igemm/MRx4c8-wasmsimd.c.in
+++ b/src/qs8-igemm/MRx4c8-wasmsimd.c.in
@@ -15,7 +15,7 @@
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
-void xnn_qs8_igemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x4c8__wasmsimd${LOAD_SUFFIX}(
+void xnn_qs8_igemm${GEMM_SUFFIX}_minmax_gemmlowp_ukernel_${MR}x4c8__wasmsimd${LOAD_SUFFIX}(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/MRx8c8-avx2.c.in b/src/qs8-igemm/MRx8c8-avx2.c.in
index 34df8bb..99f1cd3 100644
--- a/src/qs8-igemm/MRx8c8-avx2.c.in
+++ b/src/qs8-igemm/MRx8c8-avx2.c.in
@@ -13,7 +13,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_${MR}x8c8__avx2(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/MRxNRc4-neondot.c.in b/src/qs8-igemm/MRxNRc4-neondot.c.in
index 65765ca..f0b49b0 100644
--- a/src/qs8-igemm/MRxNRc4-neondot.c.in
+++ b/src/qs8-igemm/MRxNRc4-neondot.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c4__neondot(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x${NR}c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/c16-neon-mlal-padal.c.in b/src/qs8-igemm/c16-neon-mlal-padal.c.in
index e388b60..d02f81e 100644
--- a/src/qs8-igemm/c16-neon-mlal-padal.c.in
+++ b/src/qs8-igemm/c16-neon-mlal-padal.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c16__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x${NR}c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/c2-neon-mull-padal-dup.c.in b/src/qs8-igemm/c2-neon-mull-padal-dup.c.in
index 47ded3c..3a4b914 100644
--- a/src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+++ b/src/qs8-igemm/c2-neon-mull-padal-dup.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c2__neon_${"mlal" if MLA else "mull"}_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x${NR}c2__neon_${"mlal" if MLA else "mull"}_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/c8-neon-mull-padal.c.in b/src/qs8-igemm/c8-neon-mull-padal.c.in
index 5ee7068..196a70e 100644
--- a/src/qs8-igemm/c8-neon-mull-padal.c.in
+++ b/src/qs8-igemm/c8-neon-mull-padal.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c8__neon_${"mlal" if MLA else "mull"}_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x${NR}c8__neon_${"mlal" if MLA else "mull"}_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
index db4b2c2..01682b2 100644
--- a/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane_prfm(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c
index 31abb20..32d6b00 100644
--- a/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mull-addw-dup.c
index 80eb6bf..17e0868 100644
--- a/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x16c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/1x16c16-minmax-gemmlowp-neon-mlal-padal.c
index 78c2c1b..f8151f5 100644
--- a/src/qs8-igemm/gen/1x16c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/1x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 3302353..6a0ffb0 100644
--- a/src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
index d0f3a9a..2e29d63 100644
--- a/src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x16c4-minmax-gemmlowp-neondot.c b/src/qs8-igemm/gen/1x16c4-minmax-gemmlowp-neondot.c
index 562b466..a6a0478 100644
--- a/src/qs8-igemm/gen/1x16c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-igemm/gen/1x16c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-avx512skx.c b/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-avx512skx.c
index fd29a93..4e5a727 100644
--- a/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-avx512skx.c
+++ b/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-avx512skx.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__avx512skx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
index f55054d..b2e62e0 100644
--- a/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
index 266213a..ad3ce4b 100644
--- a/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x2-minmax-gemmlowp-scalar.c b/src/qs8-igemm/gen/1x2-minmax-gemmlowp-scalar.c
index 5348c86..ec74009 100644
--- a/src/qs8-igemm/gen/1x2-minmax-gemmlowp-scalar.c
+++ b/src/qs8-igemm/gen/1x2-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x2__scalar(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x2__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4-minmax-gemmlowp-scalar.c b/src/qs8-igemm/gen/1x4-minmax-gemmlowp-scalar.c
index d25d7f3..44afd15 100644
--- a/src/qs8-igemm/gen/1x4-minmax-gemmlowp-scalar.c
+++ b/src/qs8-igemm/gen/1x4-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4__scalar(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
index f309d27..cc0fdfd 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
index 852be0e..4f2b03a 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
index 74c720c..2f5d65c 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
index 12f9ea0..cd7e672 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
index f3a9c3a..bb86be8 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
index 4e9e5f9..6660cff 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
index 4380fab..952c34d 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
index 655c345..6689a43 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
index 95a2775..98462b9 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
index 841e214..7db8c4a 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
index 58d01b6..0139649 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
index d2dd638..5e41b07 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
index 1793705..558942f 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
index 5d1fdae..abfed40 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
index eaffaa6..35c7f15 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
index ad2344f..86ea5bc 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
index da9a1c1..7dce8bf 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
index 520ba62..f70dfe7 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c
index 22c44b8..698b1c2 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__wasmsimd_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld64.c
index 3462440..05a1aed 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__wasmsimd_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
index b0646d5..e1d97fc 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
index 7403cf2..566417e 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 5402416..8ece13e 100644
--- a/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane_prfm(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane.c
index c015e50..78eb749 100644
--- a/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mull-addw-dup.c
index bcf3557..b61af38 100644
--- a/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c
index 8138af7..706e229 100644
--- a/src/qs8-igemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 0cdfb22..c3cd147 100644
--- a/src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
index c4e7719..12abba6 100644
--- a/src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x8c4-minmax-gemmlowp-neondot.c b/src/qs8-igemm/gen/1x8c4-minmax-gemmlowp-neondot.c
index 462f5f8..3c36334 100644
--- a/src/qs8-igemm/gen/1x8c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-igemm/gen/1x8c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
index 191f29c..22f6dce 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+++ b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -33,7 +33,7 @@
 # x16, x17, x7 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -290,7 +290,7 @@
 9:
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
index 60e5690..7432bc9 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+++ b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -33,7 +33,7 @@
 # x16, x17, x7 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -293,7 +293,7 @@
 9:
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
index 91b25b2..b8b8fdc 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+++ b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -32,7 +32,7 @@
 # temp0  v17 v19 v21 v23
 
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -266,7 +266,7 @@
 9:
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index 6d9f584..177d0a3 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -32,7 +32,7 @@
 # temp0  v17 v19 v21 v23
 
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -260,7 +260,7 @@
 9:
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-avx2.c b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-avx2.c
index b988ae6..6425aab 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-avx2.c
+++ b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-avx2.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
index d9d019c..13106af 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
index 4ba8e9d..1005f20 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 5e44b61..d016b77 100644
--- a/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane_prfm(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c
index 5e43c14..141a90f 100644
--- a/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mull-addw-dup.c
index b4ca64f..36633c2 100644
--- a/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x16c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/2x16c16-minmax-gemmlowp-neon-mlal-padal.c
index 1e09bef..87ac69d 100644
--- a/src/qs8-igemm/gen/2x16c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/2x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/2x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 9a43b7f..747974b 100644
--- a/src/qs8-igemm/gen/2x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/2x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x16c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-igemm/gen/2x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
index 8b1aa64..378be07 100644
--- a/src/qs8-igemm/gen/2x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/2x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-avx512skx.c b/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-avx512skx.c
index 1a3b682..7df5372 100644
--- a/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-avx512skx.c
+++ b/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-avx512skx.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__avx512skx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
index cb9def0..9f81ab2 100644
--- a/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
index f38918b..7e67595 100644
--- a/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x2-minmax-gemmlowp-scalar.c b/src/qs8-igemm/gen/2x2-minmax-gemmlowp-scalar.c
index 3df4ea9..d00969c 100644
--- a/src/qs8-igemm/gen/2x2-minmax-gemmlowp-scalar.c
+++ b/src/qs8-igemm/gen/2x2-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x2__scalar(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x2__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4-minmax-gemmlowp-scalar.c b/src/qs8-igemm/gen/2x4-minmax-gemmlowp-scalar.c
index 41f6d3b..604724d 100644
--- a/src/qs8-igemm/gen/2x4-minmax-gemmlowp-scalar.c
+++ b/src/qs8-igemm/gen/2x4-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4__scalar(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
index cc08b92..21c2ab0 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
index 21823a3..8c732d0 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
index 89f3698..f44e287 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c2__sse2_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
index d59bf28..f9c23a9 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c2__sse2_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
index b852014..ccc6837 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
index 4532c35..6574638 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
index 10fc1e4..3e938c5 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c2__ssse3_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
index 175955b..2c7bb65 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c2__ssse3_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
index 71b9cab..bc8de63 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
index 88649df..d7ba893 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
index 9307dc8..d5e1ccc 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
index b92f417..b0da08e 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
index 9304588..68a37f8 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse2_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
index c6773a8..ab3cb2a 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse2_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
index 0c1f2ab..d347a52 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
index 96af574..7297ec2 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
index 95a3f59..8595f87 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__ssse3_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
index ec11e48..72c693b 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__ssse3_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c
index 48c61b0..eef58f3 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__wasmsimd_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld64.c
index fd14cfa..0a27efb 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__wasmsimd_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
index bd3c5a7..7aca948 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
index bb0a7c2..3c84d46 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
index e00f237..5bf0385 100644
--- a/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane_prfm(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c
index 17a491a..f85fbae 100644
--- a/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mull-addw-dup.c
index c1e4562..263868b 100644
--- a/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c
index 3066e60..6c951af 100644
--- a/src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 9545632..082533b 100644
--- a/src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
index a62dc26..1c9c0f5 100644
--- a/src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
index badd8a2..dd54ced 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+++ b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -36,7 +36,7 @@
 # x16, x17, x20, x21 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -426,7 +426,7 @@
         LDP     d8, d9, [sp], 80
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
index fe63272..819b9ac 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+++ b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -36,7 +36,7 @@
 # x16, x17, x20, x21 tenporary a53 gpr load data
 
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -430,7 +430,7 @@
         LDP     d8, d9, [sp], 80
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
index b38fa1e..8c28b34 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+++ b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -34,7 +34,7 @@
 # temp0   v2 v10 v12 v14
 # temp1   v3 v11 v13 v15
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -376,7 +376,7 @@
         LDP     d8, d9, [sp], 64
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index ce266a3..e2a0b28 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -34,7 +34,7 @@
 # temp0   v2 v10 v12 v14
 # temp1   v3 v11 v13 v15
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal
 
         # Clamp C pointers
         LDP     x10, x11, [sp]          // Load cn_stride, a_offset
@@ -372,7 +372,7 @@
         LDP     d8, d9, [sp], 64
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-avx2.c b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-avx2.c
index b33bfa9..caca667 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-avx2.c
+++ b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-avx2.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
index 1f2bf2d..a3af826 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
index 5f109d6..31bda68 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
index f6886b6..f61a9de 100644
--- a/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane_prfm(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane.c
index 68c61de..0a0cb9e 100644
--- a/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mull-addw-dup.c
index ddd6d7d..afc1431 100644
--- a/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x16c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/3x16c16-minmax-gemmlowp-neon-mlal-padal.c
index e382f8b..9662891 100644
--- a/src/qs8-igemm/gen/3x16c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/3x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/3x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 4bb026f..eb37a3d 100644
--- a/src/qs8-igemm/gen/3x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/3x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x16c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-igemm/gen/3x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
index 6d14058..f176861 100644
--- a/src/qs8-igemm/gen/3x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/3x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-avx512skx.c b/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-avx512skx.c
index 73d32f0..c4d719b 100644
--- a/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-avx512skx.c
+++ b/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-avx512skx.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__avx512skx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
index fe9e909..eebeef7 100644
--- a/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
index d183585..be2f12e 100644
--- a/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x2-minmax-gemmlowp-scalar.c b/src/qs8-igemm/gen/3x2-minmax-gemmlowp-scalar.c
index 26376a3..c7e5fff 100644
--- a/src/qs8-igemm/gen/3x2-minmax-gemmlowp-scalar.c
+++ b/src/qs8-igemm/gen/3x2-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x2__scalar(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x2__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4-minmax-gemmlowp-scalar.c b/src/qs8-igemm/gen/3x4-minmax-gemmlowp-scalar.c
index 37be2a5..5a5d479 100644
--- a/src/qs8-igemm/gen/3x4-minmax-gemmlowp-scalar.c
+++ b/src/qs8-igemm/gen/3x4-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4__scalar(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
index 07170b4..32e68f2 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
index a6224db..d264f88 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
index 991e596..9dcad31 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c2__sse2_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
index 1dff2c6..ecf380a 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c2__sse2_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
index e9ac047..a615fc1 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
index cf64636..f3b352b 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
index fedb5a0..e825bef 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c2__ssse3_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
index 0a105fb..86c9cbe 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c2__ssse3_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
index 6fd187b..9e4a7d8 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
index c0a46f4..7414a94 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
index 7d5fff1..36af6c3 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
index 330eaa0..cb70d63 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
index 0022e00..284079b 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
index 83d4334..5a6247f 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
index a39c3ab..79a11eb 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
index 82eec9e..ee3e7cc 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
index 0a35711..7373193 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
index 27e040c..d6dfef6 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c
index 1ef1508..f75f48e 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__wasmsimd_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld64.c
index 822c65a..85b4a7d 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__wasmsimd_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
index 96d42c0..53f2195 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
index f311e3c..09fde8b 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
index c00d092..e848075 100644
--- a/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane_prfm(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane.c
index 311e806..01c7412 100644
--- a/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mull-addw-dup.c
index 5091523..7f2be2d 100644
--- a/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x8c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/3x8c16-minmax-gemmlowp-neon-mlal-padal.c
index 4341423..a5b0286 100644
--- a/src/qs8-igemm/gen/3x8c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/3x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/3x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 69ab549..a4ce62b 100644
--- a/src/qs8-igemm/gen/3x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/3x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x8c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-igemm/gen/3x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
index 4734ce3..d551100 100644
--- a/src/qs8-igemm/gen/3x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/3x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-avx2.c b/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-avx2.c
index 2055125..63bcf90 100644
--- a/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-avx2.c
+++ b/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-avx2.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__avx2(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
index 35581ca..31e55a9 100644
--- a/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
index fd579d7..8780486 100644
--- a/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S b/src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
index e84ace2..e29d581 100644
--- a/src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
+++ b/src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -39,7 +39,7 @@
 # unused  v8 v9 v10 v11 v12 v13 v14 v15
 # x8, x21 temp for Cortex-A53 loads
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
@@ -836,7 +836,7 @@
         LDP     x20, x21, [sp], 16
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S b/src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
index ce91ed5..a8dd985 100644
--- a/src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
+++ b/src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
@@ -9,7 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
+# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -39,7 +39,7 @@
 # unused  v8 v9 v10 v11 v12 v13 v14 v15
 # x8, x21 temp for Cortex-A53 loads
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
@@ -842,7 +842,7 @@
         LDP     x20, x21, [sp], 16
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
+END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 0169f60..a86d671 100644
--- a/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane_prfm(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c
index 3a48d1e..ef22d36 100644
--- a/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mull-addw-dup.c
index 304b2df..05aef6c 100644
--- a/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x16c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/4x16c16-minmax-gemmlowp-neon-mlal-padal.c
index c8b084f..fe0baa2 100644
--- a/src/qs8-igemm/gen/4x16c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/4x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index 17d2623..8429af6 100644
--- a/src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
index d402f2f..8176e11 100644
--- a/src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-neondot.c b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-neondot.c
index 66f1317..dde4046 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-avx512skx.c b/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-avx512skx.c
index 2752f79..4630194 100644
--- a/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-avx512skx.c
+++ b/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-avx512skx.c
@@ -16,7 +16,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__avx512skx(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
index b97ecf7..dace7e7 100644
--- a/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
index 33a9ff1..3aa103b 100644
--- a/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x2-minmax-gemmlowp-scalar.c b/src/qs8-igemm/gen/4x2-minmax-gemmlowp-scalar.c
index 4c2bdae..934799b 100644
--- a/src/qs8-igemm/gen/4x2-minmax-gemmlowp-scalar.c
+++ b/src/qs8-igemm/gen/4x2-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x2__scalar(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x2__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4-minmax-gemmlowp-scalar.c b/src/qs8-igemm/gen/4x4-minmax-gemmlowp-scalar.c
index b22b281..f7be8f5 100644
--- a/src/qs8-igemm/gen/4x4-minmax-gemmlowp-scalar.c
+++ b/src/qs8-igemm/gen/4x4-minmax-gemmlowp-scalar.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4__scalar(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
index e341603..137cf4e 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
index 2d815c9..4d01d59 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
index 3e86a97..ca39bac 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
index bdb9caa..66ef038 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
index 09e76c0..e2381ae 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
index 66c499c..6b06911 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
index d8559fe..e18f6ea 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
index 2318f10..f4c7be1 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
index 0ce5964..fea4ce0 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
index b44fa57..b971803 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
index aa8ea4c..ccada47 100644
--- a/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane_prfm(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane.c
index 75cf5d3..fbbc02e 100644
--- a/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mull-addw-dup.c b/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mull-addw-dup.c
index abe1d12..06556f9 100644
--- a/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mull-addw-dup.c
+++ b/src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c
index 807df44..2207ceb 100644
--- a/src/qs8-igemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/4x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
index fe50d25..ca84aba 100644
--- a/src/qs8-igemm/gen/4x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/4x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c2__neon_mlal_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x8c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-igemm/gen/4x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
index de6c5cb..9e5542d 100644
--- a/src/qs8-igemm/gen/4x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/4x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c2__neon_mull_padal_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x8c4-minmax-gemmlowp-neondot.c b/src/qs8-igemm/gen/4x8c4-minmax-gemmlowp-neondot.c
index 8f4fae8..d3d7681 100644
--- a/src/qs8-igemm/gen/4x8c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-igemm/gen/4x8c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
index 7328bbe..ada160f 100644
--- a/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
index 26179ea..11e1792 100644
--- a/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 2529389..3bb2417 100644
--- a/src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_6x16__neon_mlal_lane_prfm(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x16__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane.c
index 660b7aa..5af8751 100644
--- a/src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_6x16__neon_mlal_lane(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x16__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/6x16c4-minmax-gemmlowp-neondot.c b/src/qs8-igemm/gen/6x16c4-minmax-gemmlowp-neondot.c
index c887d8b..06ee29d 100644
--- a/src/qs8-igemm/gen/6x16c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-igemm/gen/6x16c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x16c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane-prfm.c b/src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
index 6418730..e3a6be1 100644
--- a/src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+++ b/src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_6x8__neon_mlal_lane_prfm(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x8__neon_mlal_lane_prfm(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane.c b/src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane.c
index 5e5e698..9231e07 100644
--- a/src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane.c
+++ b/src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane.c
@@ -15,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_6x8__neon_mlal_lane(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x8__neon_mlal_lane(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/6x8c4-minmax-gemmlowp-neondot.c b/src/qs8-igemm/gen/6x8c4-minmax-gemmlowp-neondot.c
index ba44c4e..247fb20 100644
--- a/src/qs8-igemm/gen/6x8c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-igemm/gen/6x8c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x8c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/8x16c4-minmax-gemmlowp-neondot.c b/src/qs8-igemm/gen/8x16c4-minmax-gemmlowp-neondot.c
index 3083482..191c27f 100644
--- a/src/qs8-igemm/gen/8x16c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-igemm/gen/8x16c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_8x16c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/gen/8x8c4-minmax-gemmlowp-neondot.c b/src/qs8-igemm/gen/8x8c4-minmax-gemmlowp-neondot.c
index a00dd50..420a821 100644
--- a/src/qs8-igemm/gen/8x8c4-minmax-gemmlowp-neondot.c
+++ b/src/qs8-igemm/gen/8x8c4-minmax-gemmlowp-neondot.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_8x8c4__neondot(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/neon-mlal-lane.c.in b/src/qs8-igemm/neon-mlal-lane.c.in
index 214c431..5fcebaf 100644
--- a/src/qs8-igemm/neon-mlal-lane.c.in
+++ b/src/qs8-igemm/neon-mlal-lane.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}__neon_mlal_lane${"_prfm" if PREFETCH else ""}(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x${NR}__neon_mlal_lane${"_prfm" if PREFETCH else ""}(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/neon-mull-addw-dup.c.in b/src/qs8-igemm/neon-mull-addw-dup.c.in
index 45f3ff4..f229b60 100644
--- a/src/qs8-igemm/neon-mull-addw-dup.c.in
+++ b/src/qs8-igemm/neon-mull-addw-dup.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}__neon_mull_addw_dup(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x${NR}__neon_mull_addw_dup(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/qs8-igemm/scalar.c.in b/src/qs8-igemm/scalar.c.in
index dd29263..71ad765 100644
--- a/src/qs8-igemm/scalar.c.in
+++ b/src/qs8-igemm/scalar.c.in
@@ -10,7 +10,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}__scalar(
+void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x${NR}__scalar(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
index 7d90e98..b478e31 100644
--- a/src/xnnpack/dwconv.h
+++ b/src/xnnpack/dwconv.h
@@ -290,109 +290,109 @@
     const int8_t* zero,                                             \
     const union xnn_qs8_gemm_params* params);
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__neon_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__neon_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__neon_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x9__neon_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse2_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse2_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__sse2_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__sse2_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__sse2_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x9__ssse3_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x9__ssse3_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__ssse3_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__ssse3_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__ssse3_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__sse41_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__sse41_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__sse41_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__avx_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__avx_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx2_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x9__avx2_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__sse41_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__sse41_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__sse41_mul32)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__avx_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__avx_mul32)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__xop_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__xop_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__xop_mul32)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__avx2_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx2_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__avx2_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x9__avx2_mul32)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx512skx_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__avx512skx_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x9__avx512skx_mul32)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x9__wasmsimd_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__wasmsimd_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x9__wasmsimd_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x9__wasmsimd_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up1x9__scalar)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up2x9__scalar)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up4x9__scalar)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up1x9__scalar)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up2x9__scalar)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up4x9__scalar)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x25__neon_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__neon_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x25__neon_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up32x25__neon_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__neon_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__neon_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__neon_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x25__neon_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse2_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse2_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse2_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__sse2_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__sse2_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__sse2_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x25__ssse3_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__ssse3_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x25__ssse3_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__ssse3_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__ssse3_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__ssse3_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__sse41_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__sse41_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__sse41_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__avx_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__avx_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx2_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x25__avx2_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__sse41_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__sse41_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__sse41_mul32)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__avx_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__avx_mul32)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__xop_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__xop_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__xop_mul32)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__avx2_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx2_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__avx2_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x25__avx2_mul32)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx512skx_mul32)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx512skx_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__avx512skx_mul32)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up32x25__avx512skx_mul32)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up8x25__wasmsimd_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up16x25__wasmsimd_mul16)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up24x25__wasmsimd_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x25__wasmsimd_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up16x25__wasmsimd_mul16)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up24x25__wasmsimd_mul16)
 
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up1x25__scalar)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up2x25__scalar)
-DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_ukernel_up4x25__scalar)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up1x25__scalar)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up2x25__scalar)
+DECLARE_QS8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up4x25__scalar)
 
 
 #define DECLARE_F32_DWCONV2D_CHW_MINMAX_UKERNEL_FUNCTION(fn_name) \
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index cd81081..399c98d 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -518,236 +518,236 @@
       size_t cn_stride,                                   \
       const union xnn_qs8_gemm_params* params);
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_6x8__neon_mlal_lane)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_6x16__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x8__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__neon_mlal_lane)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x16__neon_mlal_lane)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_6x8__neon_mlal_lane_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_6x16__neon_mlal_lane_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8__neon_mlal_lane_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8__neon_mlal_lane_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8__neon_mlal_lane_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x8__neon_mlal_lane_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16__neon_mlal_lane_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16__neon_mlal_lane_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16__neon_mlal_lane_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__neon_mlal_lane_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x16__neon_mlal_lane_prfm)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8__neon_mull_addw_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8__neon_mull_addw_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8__neon_mull_addw_dup)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16__neon_mull_addw_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__neon_mull_addw_dup)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c2__neon_mull_padal_dup)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c2__neon_mull_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c2__neon_mull_padal_dup)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c2__neon_mlal_padal_dup)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c2__neon_mlal_padal_dup)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c4__neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x8c4__neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x8c4__neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_8x8c4__neondot)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_6x16c4__neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_8x16c4__neondot)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mull_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse2_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse2_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c2__ssse3_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c2__ssse3_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__ssse3_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse41_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__avx_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__xop_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse2_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse2_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c2__ssse3_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c2__ssse3_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__ssse3_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse41_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__avx_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__xop_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__sse2_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__ssse3_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__sse41_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__avx_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__xop_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__sse2_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__ssse3_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__sse41_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__avx_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__xop_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__avx2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__avx2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__avx2)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__avx512skx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x16c8__avx512skx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x16c8__avx512skx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c8__avx512skx)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__wasmsimd_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__wasmsimd_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__wasmsimd_ld64)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c8__wasmsimd_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__wasmsimd_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c8__wasmsimd_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x2__scalar)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x2__scalar)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x2__scalar)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x2__scalar)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x2__scalar)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x2__scalar)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x2__scalar)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x2__scalar)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4__scalar)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4__scalar)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x4__scalar)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_4x4__scalar)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4__scalar)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4__scalar)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4__scalar)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4__scalar)
 
 
 #define DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(fn_name) \
@@ -763,58 +763,58 @@
       size_t cn_stride,                                      \
       const union xnn_qs8_gemm_params* params);
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__sse2)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__sse2)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse2)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__ssse3)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__ssse3)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__ssse3)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__ssse3)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__sse41)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__sse41)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__avx)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__avx)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__avx)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__xop)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__xop)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__xop)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse2)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse2)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__ssse3)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__ssse3)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2)
 
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__wasmsimd)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd)
-DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__wasmsimd)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__wasmsimd)
+DECLARE_QS8_GEMM_XW_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__wasmsimd)
 
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 08350e3..a7e34bd 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -327,230 +327,230 @@
       const int8_t* zero,                                  \
       const union xnn_qs8_gemm_params* params);
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_6x8__neon_mlal_lane)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_6x16__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x8__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x16__neon_mlal_lane)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_6x8__neon_mlal_lane_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_6x16__neon_mlal_lane_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8__neon_mlal_lane_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8__neon_mlal_lane_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8__neon_mlal_lane_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x8__neon_mlal_lane_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16__neon_mlal_lane_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16__neon_mlal_lane_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16__neon_mlal_lane_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__neon_mlal_lane_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x16__neon_mlal_lane_prfm)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8__neon_mull_addw_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8__neon_mull_addw_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8__neon_mull_addw_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8__neon_mull_addw_dup)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16__neon_mull_addw_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__neon_mull_addw_dup)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mull_padal)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mull_padal)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__neon_mlal_padal)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c16__neon_mlal_padal)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c16__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c16__neon_mlal_padal)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c2__neon_mull_padal_dup)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c2__neon_mull_padal_dup)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c2__neon_mlal_padal_dup)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c2__neon_mlal_padal_dup)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c4__neondot)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c4__neondot)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x8c4__neondot)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_8x8c4__neondot)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c4__neondot)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__neondot)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x16c4__neondot)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_8x16c4__neondot)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c16__aarch64_neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse2_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse2_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c2__ssse3_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c2__ssse3_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__ssse3_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse41_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__avx_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__xop_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse2_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse2_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c2__ssse3_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c2__ssse3_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__ssse3_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse41_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__avx_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__xop_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse2_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__sse2_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__ssse3_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__ssse3_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__sse41_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__avx_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__xop_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse2_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__sse2_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__sse2_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__ssse3_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__ssse3_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__ssse3_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__sse41_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__sse41_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__avx_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__avx_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__avx_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__xop_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__xop_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__xop_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__avx2)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__avx2)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__avx2)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__avx512skx)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__avx512skx)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__avx512skx)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c8__avx512skx)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__wasmsimd_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__wasmsimd_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__wasmsimd_ld64)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c8__wasmsimd_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c8__wasmsimd_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c8__wasmsimd_ld128)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x2__scalar)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x2__scalar)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x2__scalar)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x2__scalar)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x2__scalar)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x2__scalar)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x2__scalar)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x2__scalar)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4__scalar)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4__scalar)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x4__scalar)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x4__scalar)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4__scalar)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4__scalar)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4__scalar)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x4__scalar)
 
 
 #ifdef __cplusplus