Use specialized layouts in SSE4/AVX2 QS8 [I]GEMM & DWCONV microkernels

PiperOrigin-RevId: 375310512
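
Note on the "specialized layouts": the params structs themselves are defined outside this patch. The sketch below is an illustrative, hypothetical reconstruction of what the per-ISA layouts look like, inferred only from the field accesses in the regenerated kernels (e.g. _mm_loadl_epi64 on shift, _mm_max_epi8/_mm_min_epi8 on output_min/output_max, _mm256_load_si256 on the AVX2 constants). The struct names, the QS8_ALIGN macro, and the exact array widths are placeholders; ordering and alignment in the real XNNPACK headers may differ.

  /*
   * Hypothetical sketch only -- not the actual XNNPACK params.h definitions.
   * Field names mirror the accesses in the kernels below (params->sse2.*,
   * params->sse4.*, params->avx2.*).
   */
  #include <stdint.h>

  #define QS8_ALIGN(bytes) __attribute__((__aligned__(bytes)))

  /* Generic SSE2 layout: constants replicated across a 128-bit lane;
   * output_min/output_max kept as int16 because SSE2 has no packed signed
   * 8-bit min/max, so clamping happens before _mm_packs_epi16. */
  struct qs8_params_sse2 {
    QS8_ALIGN(16) int32_t multiplier[4];
    QS8_ALIGN(16) int64_t rounding[2];
    QS8_ALIGN(16) int32_t remainder_mask[4];
    QS8_ALIGN(16) int32_t remainder_threshold[4];
    QS8_ALIGN(16) int64_t shift[2];            /* loaded with _mm_loadl_epi64 */
    QS8_ALIGN(16) int16_t output_zero_point[8];
    QS8_ALIGN(16) int16_t output_min[8];       /* _mm_max_epi16 */
    QS8_ALIGN(16) int16_t output_max[8];       /* _mm_min_epi16 */
  };

  /* Specialized SSE4.1 layout: output_min/output_max replicated as int8 so
   * SSE4.1/AVX/XOP kernels clamp after packing with _mm_max_epi8/_mm_min_epi8. */
  struct qs8_params_sse4 {
    QS8_ALIGN(16) int32_t multiplier[4];
    QS8_ALIGN(16) int64_t rounding[2];
    QS8_ALIGN(16) int32_t remainder_mask[4];
    QS8_ALIGN(16) int32_t remainder_threshold[4];
    QS8_ALIGN(16) int64_t shift[2];            /* loaded with _mm_loadl_epi64 */
    QS8_ALIGN(16) int16_t output_zero_point[8];
    QS8_ALIGN(16) int8_t output_min[16];       /* _mm_max_epi8 */
    QS8_ALIGN(16) int8_t output_max[16];       /* _mm_min_epi8 */
  };

  /* Specialized AVX2 layout: 32-bit constants replicated across a full
   * 256-bit lane so kernels use _mm256_load_si256 directly instead of
   * _mm256_broadcastsi128_si256 around a 128-bit load. */
  struct qs8_params_avx2 {
    QS8_ALIGN(32) int32_t multiplier[8];
    QS8_ALIGN(32) int64_t rounding[4];
    QS8_ALIGN(32) int32_t remainder_mask[8];
    QS8_ALIGN(32) int32_t remainder_threshold[8];
    QS8_ALIGN(32) int64_t shift[4];            /* only the low 64 bits are used */
    QS8_ALIGN(32) int16_t output_zero_point[16];
    QS8_ALIGN(32) int8_t output_min[32];       /* _mm_max_epi8 on the low half */
    QS8_ALIGN(32) int8_t output_max[32];       /* _mm_min_epi8 on the low half */
  };

The effect of the specialization is visible throughout the kernel diffs below: the int8 clamp is applied once after _mm_packs_epi16 instead of twice on int16 halves, the shift is loaded as a single 64-bit value, and the AVX2 kernels drop the broadcast around each 128-bit constant load.
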
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 7be7571..0bf89b9 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -689,231 +689,231 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void qs8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, 2, 16, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX512SKX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX512SKX);
   }
   static void qs8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, 3, 16, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX512SKX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX512SKX);
   }
   static void qs8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, 4, 16, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX512SKX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX512SKX);
   }
 
   static void qs8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, 2, 8, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX2);
+      xnn_init_qs8_gemm_avx2_params, benchmark::utils::CheckAVX2);
   }
   static void qs8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, 3, 8, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX2);
+      xnn_init_qs8_gemm_avx2_params, benchmark::utils::CheckAVX2);
   }
 
   static void qs8_gemm_xw_2x8c8__avx2(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, 2, 8, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX2, true);
+      xnn_init_qs8_gemm_avx2_params, benchmark::utils::CheckAVX2, true);
   }
   static void qs8_gemm_xw_3x8c8__avx2(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, 3, 8, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX2, true);
+      xnn_init_qs8_gemm_avx2_params, benchmark::utils::CheckAVX2, true);
   }
 
   static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, 2, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP);
   }
   static void qs8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, 3, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP);
   }
   static void qs8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, 4, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP);
   }
 
   static void qs8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, 2, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP);
   }
   static void qs8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, 3, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP);
   }
   static void qs8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, 4, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP);
   }
 
   static void qs8_gemm_xw_2x4c2__xop(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__xop, 2, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP, true);
   }
   static void qs8_gemm_xw_3x4c2__xop(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__xop, 3, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP, true);
   }
   static void qs8_gemm_xw_4x4c2__xop(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, 4, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP, true);
   }
 
   static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, 2, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP);
   }
   static void qs8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, 3, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP);
   }
 
   static void qs8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, 2, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP);
   }
   static void qs8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, 3, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP);
   }
 
   static void qs8_gemm_xw_2x4c8__xop(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, 2, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP, true);
   }
   static void qs8_gemm_xw_3x4c8__xop(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, 3, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckXOP, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckXOP, true);
   }
 
   static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, 2, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX);
   }
   static void qs8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, 3, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX);
   }
   static void qs8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, 4, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX);
   }
 
   static void qs8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, 2, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX);
   }
   static void qs8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, 3, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX);
   }
   static void qs8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, 4, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX);
   }
 
   static void qs8_gemm_xw_2x4c2__avx(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__avx, 2, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX, true);
   }
   static void qs8_gemm_xw_3x4c2__avx(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__avx, 3, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX, true);
   }
   static void qs8_gemm_xw_4x4c2__avx(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, 4, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX, true);
   }
 
   static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, 2, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX);
   }
   static void qs8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, 3, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX);
   }
 
   static void qs8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, 2, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX);
   }
   static void qs8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, 3, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX);
   }
 
   static void qs8_gemm_xw_2x4c8__avx(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, 2, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX, true);
   }
   static void qs8_gemm_xw_3x4c8__avx(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, 3, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckAVX, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckAVX, true);
   }
 
   static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, 2, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41);
   }
   static void qs8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, 3, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41);
   }
   static void qs8_gemm_4x4c2__sse41_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, 4, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41);
   }
 
   static void qs8_gemm_2x4c2__sse41_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, 2, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41);
   }
   static void qs8_gemm_3x4c2__sse41_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, 3, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41);
   }
   static void qs8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, 4, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41);
   }
 
   static void qs8_gemm_xw_2x4c2__sse41(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_2x4c2__sse41, 2, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41, true);
   }
   static void qs8_gemm_xw_3x4c2__sse41(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_3x4c2__sse41, 3, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41, true);
   }
   static void qs8_gemm_xw_4x4c2__sse41(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, 4, 4, 2, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41, true);
   }
 
   static void qs8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, 2, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41);
   }
   static void qs8_gemm_3x4c8__sse41_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, 3, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41);
   }
 
   static void qs8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, 2, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41);
   }
   static void qs8_gemm_3x4c8__sse41_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, 3, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41);
   }
 
   static void qs8_gemm_xw_2x4c8__sse41(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, 2, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41, true);
   }
   static void qs8_gemm_xw_3x4c8__sse41(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, 3, 4, 8, 1,
-      xnn_init_qs8_gemm_sse2_params, benchmark::utils::CheckSSE41, true);
+      xnn_init_qs8_gemm_sse4_params, benchmark::utils::CheckSSE41, true);
   }
 
   static void qs8_gemm_2x4c2__ssse3_ld64(benchmark::State& state, const char* net) {
diff --git a/src/init.c b/src/init.c
index 489772f..5aa36d9 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1560,7 +1560,7 @@
       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx);
       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx);
       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx);
-      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.gemm.mr = 4;
       xnn_params.qs8.gemm.nr = 16;
       xnn_params.qs8.gemm.log2_kr = 3;
@@ -1570,7 +1570,7 @@
       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64);
       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64);
       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64);
-      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.gemm.mr = 2;
       xnn_params.qs8.gemm.nr = 4;
       xnn_params.qs8.gemm.log2_kr = 3;
@@ -1579,7 +1579,7 @@
       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2);
       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2);
       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2);
-      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_avx2_params;
       xnn_params.qs8.gemm.mr = 3;
       xnn_params.qs8.gemm.nr = 8;
       xnn_params.qs8.gemm.log2_kr = 3;
@@ -1588,7 +1588,7 @@
       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128);
       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128);
       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128);
-      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.gemm.mr = 2;
       xnn_params.qs8.gemm.nr = 4;
       xnn_params.qs8.gemm.log2_kr = 3;
@@ -1597,7 +1597,7 @@
       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64);
       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64);
       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64);
-      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.gemm.mr = 3;
       xnn_params.qs8.gemm.nr = 4;
       xnn_params.qs8.gemm.log2_kr = 3;
@@ -1631,31 +1631,31 @@
     } else if (cpuinfo_has_x86_xop()) {
       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32;
-      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[0].channel_tile = 16;
       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32;
-      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[1].channel_tile = 16;
     } else if (cpuinfo_has_x86_avx2()) {
       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32;
-      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_avx2_params;
       xnn_params.qs8.dwconv[0].channel_tile = 16;
       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32;
-      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_avx2_params;
       xnn_params.qs8.dwconv[1].channel_tile = 16;
     } else if (cpuinfo_has_x86_avx()) {
       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32;
-      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[0].channel_tile = 16;
       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32;
-      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[1].channel_tile = 16;
     } else if (cpuinfo_has_x86_sse4_1()) {
       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16;
-      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[0].channel_tile = 8;
       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16;
-      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse2_params;
+      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_gemm_sse4_params;
       xnn_params.qs8.dwconv[1].channel_tile = 8;
     } else if (cpuinfo_has_x86_ssse3()) {
       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__ssse3_mul16;
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-avx-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-avx-mul16.c
index fd78bf2..201e81c 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-avx-mul16.c
@@ -693,8 +693,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -723,7 +723,7 @@
       const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC);
       const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -733,8 +733,8 @@
       const __m128i vremCDEF =
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -744,17 +744,19 @@
       vaccCDEF =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -1093,8 +1095,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -1113,28 +1115,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-avx-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-avx-mul32.c
index 58e7828..715c5ab 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-avx-mul32.c
@@ -544,8 +544,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -574,7 +574,7 @@
       const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC);
       const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -584,8 +584,8 @@
       const __m128i vremCDEF =
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -595,16 +595,15 @@
       vaccCDEF =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
@@ -768,8 +767,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -781,23 +780,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-avx2-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-avx2-mul16.c
index 74b833a..15c168f 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-avx2-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-avx2-mul16.c
@@ -391,8 +391,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
       const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -410,28 +410,29 @@
       const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
       const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
       const __m256i vrem89ABCDEF =
         _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod89ABCDEF, vshift), _mm256_cmpgt_epi32(vrem89ABCDEF, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-      __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
-
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
+      const __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
 
       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -642,8 +643,8 @@
         vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod24x89ABCDEF));
 
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
         const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -661,29 +662,28 @@
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
         const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
         const __m256i vrem89ABCDEF =
           _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
         vacc89ABCDEF =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod89ABCDEF, vshift), _mm256_cmpgt_epi32(vrem89ABCDEF, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
         __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extracti128_si256(vacc89ABCDEF, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-        vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
 
         __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+        vout0123456789ABCDEF = _mm_min_epi8(_mm_max_epi8(vout0123456789ABCDEF, voutput_min), voutput_max);
 
         if (c & 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456789ABCDEF);
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-avx2-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-avx2-mul32.c
index 47c0ad6..37d3bcc 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-avx2-mul32.c
@@ -391,8 +391,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
       const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -410,28 +410,29 @@
       const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
       const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
       const __m256i vrem89ABCDEF =
         _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod89ABCDEF, vshift), _mm256_cmpgt_epi32(vrem89ABCDEF, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
 
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
-
       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -594,8 +595,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -607,23 +608,23 @@
 
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
 
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-sse2-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-sse2-mul16.c
index b9a5594..13a5160 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-sse2-mul16.c
@@ -771,7 +771,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -786,12 +786,16 @@
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+      vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+      vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
 
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -1177,7 +1181,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -1186,12 +1190,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-sse41-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-sse41-mul16.c
index 528e594..16037c0 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-sse41-mul16.c
@@ -693,8 +693,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -723,7 +723,7 @@
       const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC);
       const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -733,8 +733,8 @@
       const __m128i vremCDEF =
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -744,17 +744,19 @@
       vaccCDEF =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -1093,8 +1095,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -1113,28 +1115,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
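
The SSE4.1 hunks above move the output clamp from 16-bit lanes before the final pack to 8-bit lanes after it, which relies on the SSE4.1 byte min/max instructions (_mm_max_epi8/_mm_min_epi8) that plain SSE2 lacks. Because the clamp bounds lie inside the int8 range, clamping after the saturating pack produces the same bytes while touching half as many vectors when 16 channels are handled at once. A minimal standalone sketch of the two orderings, using hypothetical bound vectors rather than XNNPACK's params layout:

#include <emmintrin.h>  /* SSE2 */
#include <smmintrin.h>  /* SSE4.1, compile with -msse4.1 */

/* SSE2-style: clamp in int16 lanes, then pack to int8 with saturation. */
static __m128i clamp_pack_sse2(__m128i vacc_lo, __m128i vacc_hi,
                               __m128i vzero_point, __m128i vmin16, __m128i vmax16) {
  __m128i vout16 = _mm_adds_epi16(_mm_packs_epi32(vacc_lo, vacc_hi), vzero_point);
  vout16 = _mm_max_epi16(vout16, vmin16);
  vout16 = _mm_min_epi16(vout16, vmax16);
  return _mm_packs_epi16(vout16, vout16);
}

/* SSE4.1-style: pack first, then clamp in int8 lanes. Results match the SSE2
 * ordering whenever the bounds are within [-128, 127], which they are for QS8. */
static __m128i clamp_pack_sse41(__m128i vacc_lo, __m128i vacc_hi,
                                __m128i vzero_point, __m128i vmin8, __m128i vmax8) {
  const __m128i vout16 = _mm_adds_epi16(_mm_packs_epi32(vacc_lo, vacc_hi), vzero_point);
  __m128i vout8 = _mm_packs_epi16(vout16, vout16);
  vout8 = _mm_max_epi8(vout8, vmin8);   /* SSE4.1 byte max */
  return _mm_min_epi8(vout8, vmax8);    /* SSE4.1 byte min */
}
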
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-sse41-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-sse41-mul32.c
index 5d0962f..f332747 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-sse41-mul32.c
@@ -544,8 +544,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -574,7 +574,7 @@
       const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC);
       const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -584,8 +584,8 @@
       const __m128i vremCDEF =
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -595,16 +595,15 @@
       vaccCDEF =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
@@ -768,8 +767,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -781,23 +780,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
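
The vrem / vremainder_threshold arithmetic repeated in every hunk is a branch-free rounding right shift: _mm_cmpgt_epi32 yields an all-ones (-1) lane wherever the comparison holds, so adding that mask subtracts 1 and subtracting it adds 1. A small self-contained illustration of the idiom with fixed example values, not tied to the kernels' parameters:

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

int main(void) {
  /* Divide four signed lanes by 2^3 with round-to-nearest (ties away from zero),
   * using the same mask arithmetic as the kernels: "rem > threshold" adds 1 to the
   * arithmetic shift result by subtracting the all-ones compare mask. */
  const __m128i v = _mm_setr_epi32(20, -20, 12, -13);
  const __m128i vshift = _mm_cvtsi32_si128(3);
  const __m128i vmask = _mm_set1_epi32((1 << 3) - 1);
  const __m128i vthreshold = _mm_set1_epi32(((1 << 3) - 1) >> 1);

  const __m128i vrem = _mm_add_epi32(_mm_and_si128(v, vmask),
                                     _mm_cmpgt_epi32(_mm_setzero_si128(), v));
  const __m128i vout = _mm_sub_epi32(_mm_sra_epi32(v, vshift),
                                     _mm_cmpgt_epi32(vrem, vthreshold));

  int32_t out[4];
  _mm_storeu_si128((__m128i*) out, vout);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* expected: 3 -3 2 -2 */
  return 0;
}
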
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-ssse3-mul16.c b/src/qs8-dwconv/gen/up16x25-minmax-ssse3-mul16.c
index 1b7af91..938d0e5 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-ssse3-mul16.c
@@ -771,7 +771,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -786,12 +786,16 @@
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+      vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+      vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
 
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -1177,7 +1181,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -1186,12 +1190,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
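
Across these files the shift parameter load narrows from a 16-byte _mm_load_si128 to an 8-byte _mm_loadl_epi64. This is safe because _mm_sra_epi32 takes its shift count from the low 64 bits of the count operand only, so eight bytes of storage and a 64-bit load are sufficient. A minimal sketch, assuming a plain uint64_t field purely for illustration:

#include <stdint.h>
#include <emmintrin.h>

/* Arithmetic right shift of four int32 lanes by a runtime count held in memory.
 * _mm_sra_epi32 reads only the low 64 bits of its count register, so an 8-byte
 * load is enough; a 16-byte load would just pull in unused data. */
static __m128i sra_epi32_by_count(__m128i v, const uint64_t* shift /* illustrative field */) {
  const __m128i vshift = _mm_loadl_epi64((const __m128i*) shift);
  return _mm_sra_epi32(v, vshift);
}
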
diff --git a/src/qs8-dwconv/gen/up16x25-minmax-xop-mul32.c b/src/qs8-dwconv/gen/up16x25-minmax-xop-mul32.c
index 41f7791..9dc6211 100644
--- a/src/qs8-dwconv/gen/up16x25-minmax-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up16x25-minmax-xop-mul32.c
@@ -549,8 +549,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -579,7 +579,7 @@
       const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC);
       const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -589,8 +589,8 @@
       const __m128i vremCDEF =
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -600,16 +600,15 @@
       vaccCDEF =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
@@ -773,8 +772,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -786,23 +785,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-avx-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-avx-mul16.c
index c125f1f..a8bb56a 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-avx-mul16.c
@@ -277,8 +277,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -307,7 +307,7 @@
       const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC);
       const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -317,8 +317,8 @@
       const __m128i vremCDEF =
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -328,17 +328,19 @@
       vaccCDEF =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -469,8 +471,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -489,28 +491,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-avx-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-avx-mul32.c
index 19dae96..45c2fa8 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-avx-mul32.c
@@ -224,8 +224,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -254,7 +254,7 @@
       const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC);
       const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -264,8 +264,8 @@
       const __m128i vremCDEF =
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -275,16 +275,15 @@
       vaccCDEF =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
@@ -352,8 +351,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -365,23 +364,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-avx2-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-avx2-mul16.c
index 1dc427b..b1db9e6 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-avx2-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-avx2-mul16.c
@@ -167,8 +167,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
       const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -186,28 +186,29 @@
       const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
       const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
       const __m256i vrem89ABCDEF =
         _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod89ABCDEF, vshift), _mm256_cmpgt_epi32(vrem89ABCDEF, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-      __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
-
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
+      const __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
 
       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -290,8 +291,8 @@
         vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_cvtepi16_epi32(vprod8x89ABCDEF));
 
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
         const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -309,29 +310,28 @@
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
         const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
         const __m256i vrem89ABCDEF =
           _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
         vacc89ABCDEF =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod89ABCDEF, vshift), _mm256_cmpgt_epi32(vrem89ABCDEF, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
         __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extracti128_si256(vacc89ABCDEF, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-        vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
 
         __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+        vout0123456789ABCDEF = _mm_min_epi8(_mm_max_epi8(vout0123456789ABCDEF, voutput_min), voutput_max);
 
         if (c & 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456789ABCDEF);
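
In the AVX2 kernels the per-iteration _mm256_broadcastsi128_si256(_mm_load_si128(...)) sequences become single 32-byte _mm256_load_si256 loads: with the specialized avx2 layout, the duplication across both 128-bit halves happens once when the parameters are initialized instead of on every call. A sketch of the two access patterns, with a local aligned array standing in for the params field:

#include <immintrin.h>  /* AVX2, compile with -mavx2 */

/* Old pattern: the params hold one 128-bit copy, broadcast to 256 bits per use. */
static __m256i load_multiplier_broadcast(const int32_t mult4[4] /* 16-byte aligned */) {
  return _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) mult4));
}

/* New pattern: the init routine stores the value already duplicated into 32 bytes,
 * so the kernel issues a single aligned 256-bit load and no shuffle. */
static __m256i load_multiplier_direct(const int32_t mult8[8] /* 32-byte aligned, duplicated halves */) {
  return _mm256_load_si256((const __m256i*) mult8);
}
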
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-avx2-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-avx2-mul32.c
index c23fc81..b03ec06 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-avx2-mul32.c
@@ -167,8 +167,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
       const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -186,28 +186,29 @@
       const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
       const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
       const __m256i vrem89ABCDEF =
         _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod89ABCDEF, vshift), _mm256_cmpgt_epi32(vrem89ABCDEF, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
 
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
-
       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -274,8 +275,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -287,23 +288,23 @@
 
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
 
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
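
Taken together, each hunk vectorizes the same per-lane requantization: a Q31 fixed-point multiply with a rounding bias, a rounding right shift, addition of the output zero point, and a clamp to the quantized range. A scalar model of that sequence (gemmlowp-style; names are illustrative and the INT32_MIN saturation corner case of the doubling multiply is omitted):

#include <stdint.h>

/* Scalar reference for one accumulator lane. Mirrors the SIMD code above:
 * q31prod approximates acc * (multiplier / 2^31), the remainder trick performs a
 * round-to-nearest right shift, then zero point and clamp produce the int8 output. */
static int8_t requantize_q31(int32_t acc, int32_t multiplier, uint32_t shift,
                             int16_t zero_point, int8_t qmin, int8_t qmax) {
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  const int32_t q31prod = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);

  /* Rounding right shift with the same tie handling as the SIMD remainder trick. */
  const int32_t remainder_mask = (int32_t) ((UINT32_C(1) << shift) - 1);
  const int32_t remainder = (q31prod & remainder_mask) - (int32_t) (q31prod < 0);
  const int32_t threshold = remainder_mask >> 1;
  int32_t out = (q31prod >> shift) + (int32_t) (remainder > threshold);

  out += zero_point;
  if (out < qmin) out = qmin;
  if (out > qmax) out = qmax;
  return (int8_t) out;
}
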
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-sse2-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-sse2-mul16.c
index ee07325..96df3cd 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-sse2-mul16.c
@@ -355,7 +355,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -370,12 +370,16 @@
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+      vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+      vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
 
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -553,7 +557,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -562,12 +566,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-sse41-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-sse41-mul16.c
index 3e2f7f8..eadc573 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-sse41-mul16.c
@@ -277,8 +277,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -307,7 +307,7 @@
       const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC);
       const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -317,8 +317,8 @@
       const __m128i vremCDEF =
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -328,17 +328,19 @@
       vaccCDEF =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -469,8 +471,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -489,28 +491,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
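
For orientation, the SSE4.1 hunks above all converge on the same specialized output stage: the int16 clamp that used to run before _mm_packs_epi16 is replaced by an int8 clamp (_mm_max_epi8/_mm_min_epi8, available since SSE4.1) applied after the pack, reading the bounds from the new sse4 params block. A minimal sketch of that stage follows, using a hypothetical qs8_sse4_output_params layout in place of the real params union; the field names and struct are illustrative only, the intrinsic sequence mirrors the generated code.

#include <smmintrin.h>  /* SSE4.1: _mm_max_epi8 / _mm_min_epi8 */
#include <stdint.h>

/* Hypothetical stand-in for the kernels' sse4 params block (illustration only). */
struct qs8_sse4_output_params {
  _Alignas(16) int16_t output_zero_point[8];
  _Alignas(16) int8_t output_min[16];
  _Alignas(16) int8_t output_max[16];
};

/* Requantized int32 accumulators -> clamped int8 lanes: pack to int16, add the
 * zero point, pack to int8, then clamp the packed int8 lanes directly. */
static inline __m128i qs8_output_stage_sse4(
    __m128i vacc0123, __m128i vacc4567, __m128i vacc89AB, __m128i vaccCDEF,
    const struct qs8_sse4_output_params* params)
{
  const __m128i vzero_point = _mm_load_si128((const __m128i*) params->output_zero_point);
  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), vzero_point);
  __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), vzero_point);

  __m128i vout = _mm_packs_epi16(vout01234567, vout89ABCDEF);
  vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->output_min));
  vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->output_max));
  return vout;
}

Clamping after the pack trades two epi16 min/max pairs per 16 outputs for a single epi8 max/min pair, which is why the SSE4.1, XOP, and AVX variants below all adopt it.
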
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-sse41-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-sse41-mul32.c
index 9cd4988..580cfb2 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-sse41-mul32.c
@@ -224,8 +224,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -254,7 +254,7 @@
       const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC);
       const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -264,8 +264,8 @@
       const __m128i vremCDEF =
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -275,16 +275,15 @@
       vaccCDEF =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
@@ -352,8 +351,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -365,23 +364,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-ssse3-mul16.c b/src/qs8-dwconv/gen/up16x9-minmax-ssse3-mul16.c
index 90f98d8..3d148fe 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-ssse3-mul16.c
@@ -355,7 +355,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -370,12 +370,16 @@
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+      vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+      vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
 
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
     }
@@ -553,7 +557,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -562,12 +566,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
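
Another change repeated through these hunks, including the plain SSE2/SSSE3 kernels that keep the sse2 params, is loading the requantization shift with _mm_loadl_epi64 instead of a full 16-byte _mm_load_si128. This is safe because _mm_sra_epi32 takes its shift count from the low 64 bits of its count operand only, so an 8-byte field and load are sufficient. A minimal sketch of that step, with a hypothetical 64-bit shift field, is:

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdint.h>

/* Arithmetic right shift of four int32 lanes by a per-tensor shift amount.
 * _mm_sra_epi32 reads only the low 64 bits of the count vector, so the shift
 * can be stored and fetched as a single 64-bit field. */
static inline __m128i qs8_arithmetic_shift_sse2(__m128i vq31prod,
                                                const uint64_t* shift /* hypothetical field */)
{
  const __m128i vshift = _mm_loadl_epi64((const __m128i*) shift);
  return _mm_sra_epi32(vq31prod, vshift);
}
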
diff --git a/src/qs8-dwconv/gen/up16x9-minmax-xop-mul32.c b/src/qs8-dwconv/gen/up16x9-minmax-xop-mul32.c
index 9a1b0e1..d1590b2 100644
--- a/src/qs8-dwconv/gen/up16x9-minmax-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up16x9-minmax-xop-mul32.c
@@ -229,8 +229,8 @@
 
       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -259,7 +259,7 @@
       const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC);
       const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -269,8 +269,8 @@
       const __m128i vremCDEF =
         _mm_add_epi32(_mm_and_si128(vq31prodCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodCDEF));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -280,16 +280,15 @@
       vaccCDEF =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       output += 16;
@@ -357,8 +356,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -370,23 +369,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-avx-mul16.c b/src/qs8-dwconv/gen/up24x25-minmax-avx-mul16.c
index 63b6dc2..5040e72 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-avx-mul16.c
@@ -895,8 +895,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 600 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -937,7 +937,7 @@
       const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC);
       const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -951,8 +951,8 @@
       const __m128i vremKLMN =
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -966,20 +966,23 @@
       vaccKLMN =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
       output += 24;
@@ -1319,8 +1322,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -1339,28 +1342,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-avx-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-avx-mul32.c
index 4d40b5c..f32c5d5 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-avx-mul32.c
@@ -696,8 +696,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 600 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -738,7 +738,7 @@
       const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC);
       const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -752,8 +752,8 @@
       const __m128i vremKLMN =
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -767,19 +767,19 @@
       vaccKLMN =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
@@ -944,8 +944,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -957,23 +957,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-avx2-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-avx2-mul32.c
index d37b9aa..30157d0 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-avx2-mul32.c
@@ -467,8 +467,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 600 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
       const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -492,7 +492,7 @@
       const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
       const __m256i vq31prodGHIJKLMN = _mm256_blend_epi16(vq31prodGIKM, vq31prodHJLN, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
       const __m256i vrem89ABCDEF =
@@ -500,8 +500,8 @@
       const __m256i vremGHIJKLMN =
         _mm256_add_epi32(_mm256_and_si256(vq31prodGHIJKLMN, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prodGHIJKLMN));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
@@ -509,18 +509,20 @@
       vaccGHIJKLMN =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prodGHIJKLMN, vshift), _mm256_cmpgt_epi32(vremGHIJKLMN, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extracti128_si256(vaccGHIJKLMN, 1)), _mm256_castsi256_si128(voutput_zero_point));
 
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, _mm256_castsi256_si128(voutput_min)), _mm256_castsi256_si128(voutput_max));
-
       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
       output += 24;
@@ -684,8 +686,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -697,23 +699,23 @@
 
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
 
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-sse2-mul16.c b/src/qs8-dwconv/gen/up24x25-minmax-sse2-mul16.c
index c58b6a0..8ba83d2 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-sse2-mul16.c
@@ -1005,7 +1005,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -1025,14 +1025,19 @@
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+      vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+      voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+      vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
+      voutGHIJKLMN = _mm_min_epi16(voutGHIJKLMN, voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
 
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
       output += 24;
@@ -1419,7 +1424,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -1428,12 +1433,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-sse41-mul16.c b/src/qs8-dwconv/gen/up24x25-minmax-sse41-mul16.c
index 5cc20c6..d0069bb 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-sse41-mul16.c
@@ -895,8 +895,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 600 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -937,7 +937,7 @@
       const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC);
       const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -951,8 +951,8 @@
       const __m128i vremKLMN =
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -966,20 +966,23 @@
       vaccKLMN =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
       output += 24;
@@ -1319,8 +1322,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -1339,28 +1342,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-sse41-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-sse41-mul32.c
index 4ad2e4f..3309700 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-sse41-mul32.c
@@ -696,8 +696,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 600 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -738,7 +738,7 @@
       const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC);
       const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -752,8 +752,8 @@
       const __m128i vremKLMN =
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -767,19 +767,19 @@
       vaccKLMN =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
@@ -944,8 +944,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -957,23 +957,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-ssse3-mul16.c b/src/qs8-dwconv/gen/up24x25-minmax-ssse3-mul16.c
index 358b3c3..499ab88 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-ssse3-mul16.c
@@ -1005,7 +1005,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -1025,14 +1025,19 @@
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+      vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+      voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+      vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
+      voutGHIJKLMN = _mm_min_epi16(voutGHIJKLMN, voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
 
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
       output += 24;
@@ -1419,7 +1424,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -1428,12 +1433,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
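
The vshift load in the hunks above shrinks from _mm_load_si128 to _mm_loadl_epi64: _mm_sra_epi32 reads its shift count only from the low 64 bits of the count operand, so an 8-byte load of the shift parameter is sufficient, and _mm_loadl_epi64 zero-fills the upper half of the register. A stand-alone sketch under that assumption (the helper name is illustrative, not from the patch):

#include <stdint.h>
#include <emmintrin.h>  /* SSE2 */

/* Arithmetic right shift of four int32 lanes by a runtime shift amount.
 * _mm_sra_epi32 only looks at the low 64 bits of the count register,
 * so a 64-bit load of the parameter is enough. */
static inline __m128i sra_epi32_by_param(__m128i v, const uint64_t* shift) {
  const __m128i vshift = _mm_loadl_epi64((const __m128i*) shift);
  return _mm_sra_epi32(v, vshift);
}
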
diff --git a/src/qs8-dwconv/gen/up24x25-minmax-xop-mul32.c b/src/qs8-dwconv/gen/up24x25-minmax-xop-mul32.c
index 91838ed..fae92d2 100644
--- a/src/qs8-dwconv/gen/up24x25-minmax-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up24x25-minmax-xop-mul32.c
@@ -701,8 +701,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 600 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -743,7 +743,7 @@
       const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC);
       const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -757,8 +757,8 @@
       const __m128i vremKLMN =
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -772,19 +772,19 @@
       vaccKLMN =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
@@ -949,8 +949,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -962,23 +962,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-avx-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-avx-mul16.c
index ad996c3..1582948 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-avx-mul16.c
@@ -351,8 +351,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -393,7 +393,7 @@
       const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC);
       const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -407,8 +407,8 @@
       const __m128i vremKLMN =
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -422,20 +422,23 @@
       vaccKLMN =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
       output += 24;
@@ -567,8 +570,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -587,28 +590,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-avx-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-avx-mul32.c
index 8569133..32bb9dd 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-avx-mul32.c
@@ -280,8 +280,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -322,7 +322,7 @@
       const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC);
       const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -336,8 +336,8 @@
       const __m128i vremKLMN =
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -351,19 +351,19 @@
       vaccKLMN =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
@@ -432,8 +432,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -445,23 +445,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-avx2-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-avx2-mul32.c
index f04663f..26221c8 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-avx2-mul32.c
@@ -195,8 +195,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
       const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -220,7 +220,7 @@
       const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
       const __m256i vq31prodGHIJKLMN = _mm256_blend_epi16(vq31prodGIKM, vq31prodHJLN, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
       const __m256i vrem89ABCDEF =
@@ -228,8 +228,8 @@
       const __m256i vremGHIJKLMN =
         _mm256_add_epi32(_mm256_and_si256(vq31prodGHIJKLMN, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prodGHIJKLMN));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
@@ -237,18 +237,20 @@
       vaccGHIJKLMN =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prodGHIJKLMN, vshift), _mm256_cmpgt_epi32(vremGHIJKLMN, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extracti128_si256(vaccGHIJKLMN, 1)), _mm256_castsi256_si128(voutput_zero_point));
 
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, _mm256_castsi256_si128(voutput_min)), _mm256_castsi256_si128(voutput_max));
-
       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
       output += 24;
@@ -316,8 +318,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -329,23 +331,23 @@
 
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
 
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
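
In the AVX2 kernels above, the run-time broadcast of 128-bit parameter blocks (_mm256_broadcastsi128_si256 over _mm_load_si128) is replaced by direct 256-bit loads from the new avx2 parameter layout, which stores each requantization constant pre-duplicated across all eight lanes. A rough illustration of the idea follows; the struct and field names are assumptions for the sketch, not the actual XNNPACK params definition:

#include <stdint.h>
#include <immintrin.h>  /* AVX2 */

/* Hypothetical 32-byte-aligned parameter block with each constant
 * replicated across 8 lanes, so kernels need only one aligned 256-bit load. */
struct example_avx2_qs8_params {
  int32_t multiplier[8];
  int32_t rounding[8];
  int32_t remainder_mask[8];
  int32_t remainder_threshold[8];
} __attribute__((aligned(32)));

static inline __m256i load_multiplier_before(const int32_t* mult /* 4 lanes, 16-byte aligned */) {
  /* old pattern: 128-bit load followed by a lane broadcast */
  return _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) mult));
}

static inline __m256i load_multiplier_after(const struct example_avx2_qs8_params* p) {
  /* new pattern: single aligned 256-bit load of the pre-broadcast field */
  return _mm256_load_si256((const __m256i*) p->multiplier);
}
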
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-sse2-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-sse2-mul16.c
index 8eda6f8..c2a24ed 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-sse2-mul16.c
@@ -461,7 +461,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -481,14 +481,19 @@
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+      vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+      voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+      vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
+      voutGHIJKLMN = _mm_min_epi16(voutGHIJKLMN, voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
 
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
       output += 24;
@@ -667,7 +672,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -676,12 +681,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-sse41-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-sse41-mul16.c
index 29b11f3..b681da4 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-sse41-mul16.c
@@ -351,8 +351,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -393,7 +393,7 @@
       const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC);
       const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -407,8 +407,8 @@
       const __m128i vremKLMN =
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -422,20 +422,23 @@
       vaccKLMN =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
       output += 24;
@@ -567,8 +570,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -587,28 +590,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-sse41-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-sse41-mul32.c
index c587239..65decb9 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-sse41-mul32.c
@@ -280,8 +280,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -322,7 +322,7 @@
       const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC);
       const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -336,8 +336,8 @@
       const __m128i vremKLMN =
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -351,19 +351,19 @@
       vaccKLMN =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
@@ -432,8 +432,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -445,23 +445,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-ssse3-mul16.c b/src/qs8-dwconv/gen/up24x9-minmax-ssse3-mul16.c
index d3934a0..46cd248 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-ssse3-mul16.c
@@ -461,7 +461,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -481,14 +481,19 @@
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+      vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+      voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+      vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
+      voutGHIJKLMN = _mm_min_epi16(voutGHIJKLMN, voutput_max);
 
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
 
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
       output += 24;
@@ -667,7 +672,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -676,12 +681,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
           output += 8;
diff --git a/src/qs8-dwconv/gen/up24x9-minmax-xop-mul32.c b/src/qs8-dwconv/gen/up24x9-minmax-xop-mul32.c
index 6924e5c..9aa0bda 100644
--- a/src/qs8-dwconv/gen/up24x9-minmax-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up24x9-minmax-xop-mul32.c
@@ -285,8 +285,8 @@
 
       w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -327,7 +327,7 @@
       const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC);
       const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
@@ -341,8 +341,8 @@
       const __m128i vremKLMN =
         _mm_add_epi32(_mm_and_si128(vq31prodKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prodKLMN));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -356,19 +356,19 @@
       vaccKLMN =
         _mm_sub_epi32(_mm_sra_epi32(vq31prodKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
       __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-      vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
-      voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
       __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
+      voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min);
+      voutGHIJKLMNGHIJKLMN = _mm_min_epi8(voutGHIJKLMNGHIJKLMN, voutput_max);
 
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
@@ -437,8 +437,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -450,23 +450,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
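
Note (editor's illustrative sketch, not part of the generated kernels): the SSE4.1 paths above now clamp after the saturating 16-bit -> 8-bit pack, using byte-wise _mm_max_epi8/_mm_min_epi8, instead of clamping the 16-bit values before the pack. Because the bounds are themselves int8 values, clamping the saturated bytes gives the same result as clamping in int16 first. A minimal self-contained version of the pattern, with scalar bounds via _mm_set1_epi8 standing in for the library's sse4 params fields:

  #include <immintrin.h>
  #include <stdint.h>

  // Sketch only: pack eight 16-bit lanes to int8 with saturation, then clamp
  // the packed bytes to [output_min, output_max] with SSE4.1 byte min/max.
  static inline __m128i qs8_pack_and_clamp_sse4(__m128i vout01234567,
                                                int8_t output_min, int8_t output_max) {
    __m128i vout = _mm_packs_epi16(vout01234567, vout01234567);  // int16 -> int8, saturating
    vout = _mm_max_epi8(vout, _mm_set1_epi8(output_min));        // lower clamp (SSE4.1)
    vout = _mm_min_epi8(vout, _mm_set1_epi8(output_max));        // upper clamp (SSE4.1)
    return vout;                                                 // low 8 bytes hold the results
  }
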
diff --git a/src/qs8-dwconv/gen/up32x25-minmax-avx2-mul16.c b/src/qs8-dwconv/gen/up32x25-minmax-avx2-mul16.c
index a0452f0..e22d41f 100644
--- a/src/qs8-dwconv/gen/up32x25-minmax-avx2-mul16.c
+++ b/src/qs8-dwconv/gen/up32x25-minmax-avx2-mul16.c
@@ -543,8 +543,8 @@
 
       w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 800 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
       const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -574,7 +574,7 @@
       const __m256i vq31prodGHIJKLMN = _mm256_blend_epi16(vq31prodGIKM, vq31prodHJLN, 0xCC);
       const __m256i vq31prodOPQRSTUV = _mm256_blend_epi16(vq31prodOQSU, vq31prodPRTV, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
       const __m256i vrem89ABCDEF =
@@ -584,8 +584,8 @@
       const __m256i vremOPQRSTUV =
         _mm256_add_epi32(_mm256_and_si256(vq31prodOPQRSTUV, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prodOPQRSTUV));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
@@ -595,18 +595,20 @@
       vaccOPQRSTUV =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prodOPQRSTUV, vshift), _mm256_cmpgt_epi32(vremOPQRSTUV, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-      __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
-      __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(vaccGHIJKLMN, vaccOPQRSTUV), voutput_zero_point);
-
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
-      voutGHIJOPQRKLMNSTUV = _mm256_min_epi16(_mm256_max_epi16(voutGHIJOPQRKLMNSTUV, voutput_min), voutput_max);
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
+      const __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
+      const __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(vaccGHIJKLMN, vaccOPQRSTUV), voutput_zero_point);
 
       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
       __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV), _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1)), _MM_SHUFFLE(3, 1, 2, 0));
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+      voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, voutput_min);
+      voutGHIJKLMNOPQRSTUV = _mm_min_epi8(voutGHIJKLMNOPQRSTUV, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
       output += 32;
@@ -846,8 +848,8 @@
         w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
         k += 16;
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
         const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -865,29 +867,28 @@
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
         const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
         const __m256i vrem89ABCDEF =
           _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
         vacc89ABCDEF =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod89ABCDEF, vshift), _mm256_cmpgt_epi32(vrem89ABCDEF, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
         __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extracti128_si256(vacc89ABCDEF, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-        vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
 
         __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+        vout0123456789ABCDEF = _mm_min_epi8(_mm_max_epi8(vout0123456789ABCDEF, voutput_min), voutput_max);
 
         if XNN_LIKELY(c >= 16) {
           _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
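
Note (editor's illustrative sketch, not part of the generated kernels): the AVX2 paths now read each requantization constant with a single aligned 256-bit load instead of a 128-bit load followed by _mm256_broadcastsi128_si256. This assumes the avx2 params block stores each constant pre-broadcast in a 32-byte-aligned field; the struct below is a hypothetical layout in that style, with assumed names:

  #include <immintrin.h>
  #include <stdalign.h>
  #include <stdint.h>

  // Hypothetical params fragment: each constant is replicated across all eight
  // 32-bit lanes when the params are initialized, so the kernel skips the
  // per-iteration lane broadcast.
  struct qs8_avx2_params_sketch {
    alignas(32) int32_t multiplier[8];
    alignas(32) int32_t rounding[8];
  };

  static inline __m256i load_avx2_multiplier(const struct qs8_avx2_params_sketch* params) {
    return _mm256_load_si256((const __m256i*) params->multiplier);  // one aligned load, no broadcast
  }
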
diff --git a/src/qs8-dwconv/gen/up32x25-minmax-avx2-mul32.c b/src/qs8-dwconv/gen/up32x25-minmax-avx2-mul32.c
index 8218e16..268ac2a 100644
--- a/src/qs8-dwconv/gen/up32x25-minmax-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up32x25-minmax-avx2-mul32.c
@@ -543,8 +543,8 @@
 
       w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 800 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
       const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -574,7 +574,7 @@
       const __m256i vq31prodGHIJKLMN = _mm256_blend_epi16(vq31prodGIKM, vq31prodHJLN, 0xCC);
       const __m256i vq31prodOPQRSTUV = _mm256_blend_epi16(vq31prodOQSU, vq31prodPRTV, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
       const __m256i vrem89ABCDEF =
@@ -584,8 +584,8 @@
       const __m256i vremOPQRSTUV =
         _mm256_add_epi32(_mm256_and_si256(vq31prodOPQRSTUV, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prodOPQRSTUV));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
@@ -595,18 +595,20 @@
       vaccOPQRSTUV =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prodOPQRSTUV, vshift), _mm256_cmpgt_epi32(vremOPQRSTUV, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
       __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(vaccGHIJKLMN, vaccOPQRSTUV), voutput_zero_point);
 
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
-      voutGHIJOPQRKLMNSTUV = _mm256_min_epi16(_mm256_max_epi16(voutGHIJOPQRKLMNSTUV, voutput_min), voutput_max);
-
       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
       __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV), _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1)), _MM_SHUFFLE(3, 1, 2, 0));
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+      voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, voutput_min);
+      voutGHIJKLMNOPQRSTUV = _mm_min_epi8(voutGHIJKLMNOPQRSTUV, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
       output += 32;
@@ -770,8 +772,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -783,23 +785,23 @@
 
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
 
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
diff --git a/src/qs8-dwconv/gen/up32x9-minmax-avx2-mul16.c b/src/qs8-dwconv/gen/up32x9-minmax-avx2-mul16.c
index 4972c11..a7be837 100644
--- a/src/qs8-dwconv/gen/up32x9-minmax-avx2-mul16.c
+++ b/src/qs8-dwconv/gen/up32x9-minmax-avx2-mul16.c
@@ -223,8 +223,8 @@
 
       w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
       const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -254,7 +254,7 @@
       const __m256i vq31prodGHIJKLMN = _mm256_blend_epi16(vq31prodGIKM, vq31prodHJLN, 0xCC);
       const __m256i vq31prodOPQRSTUV = _mm256_blend_epi16(vq31prodOQSU, vq31prodPRTV, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
       const __m256i vrem89ABCDEF =
@@ -264,8 +264,8 @@
       const __m256i vremOPQRSTUV =
         _mm256_add_epi32(_mm256_and_si256(vq31prodOPQRSTUV, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prodOPQRSTUV));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
@@ -275,18 +275,20 @@
       vaccOPQRSTUV =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prodOPQRSTUV, vshift), _mm256_cmpgt_epi32(vremOPQRSTUV, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-      __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
-      __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(vaccGHIJKLMN, vaccOPQRSTUV), voutput_zero_point);
-
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
-      voutGHIJOPQRKLMNSTUV = _mm256_min_epi16(_mm256_max_epi16(voutGHIJOPQRKLMNSTUV, voutput_min), voutput_max);
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
+      const __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
+      const __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(vaccGHIJKLMN, vaccOPQRSTUV), voutput_zero_point);
 
       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
       __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV), _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1)), _MM_SHUFFLE(3, 1, 2, 0));
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+      voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, voutput_min);
+      voutGHIJKLMNOPQRSTUV = _mm_min_epi8(voutGHIJKLMNOPQRSTUV, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
       output += 32;
@@ -382,8 +384,8 @@
         w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
         k += 16;
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
         const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -401,29 +403,28 @@
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
         const __m256i vq31prod89ABCDEF = _mm256_blend_epi16(vq31prod8ACE, vq31prod9BDF, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
         const __m256i vrem89ABCDEF =
           _mm256_add_epi32(_mm256_and_si256(vq31prod89ABCDEF, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod89ABCDEF));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
         vacc89ABCDEF =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod89ABCDEF, vshift), _mm256_cmpgt_epi32(vrem89ABCDEF, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
         __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extracti128_si256(vacc89ABCDEF, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-        vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
 
         __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+        vout0123456789ABCDEF = _mm_min_epi8(_mm_max_epi8(vout0123456789ABCDEF, voutput_min), voutput_max);
 
         if XNN_LIKELY(c >= 16) {
           _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
diff --git a/src/qs8-dwconv/gen/up32x9-minmax-avx2-mul32.c b/src/qs8-dwconv/gen/up32x9-minmax-avx2-mul32.c
index 579b30b..1e2a93e 100644
--- a/src/qs8-dwconv/gen/up32x9-minmax-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up32x9-minmax-avx2-mul32.c
@@ -223,8 +223,8 @@
 
       w = (const void*) ((uintptr_t) w + 32 * sizeof(int32_t) + 288 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
       const __m256i vacc9BDF = _mm256_shuffle_epi32(vacc89ABCDEF, _MM_SHUFFLE(3, 3, 1, 1));
@@ -254,7 +254,7 @@
       const __m256i vq31prodGHIJKLMN = _mm256_blend_epi16(vq31prodGIKM, vq31prodHJLN, 0xCC);
       const __m256i vq31prodOPQRSTUV = _mm256_blend_epi16(vq31prodOQSU, vq31prodPRTV, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
       const __m256i vrem89ABCDEF =
@@ -264,8 +264,8 @@
       const __m256i vremOPQRSTUV =
         _mm256_add_epi32(_mm256_and_si256(vq31prodOPQRSTUV, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prodOPQRSTUV));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
       vacc89ABCDEF =
@@ -275,18 +275,20 @@
       vaccOPQRSTUV =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prodOPQRSTUV, vshift), _mm256_cmpgt_epi32(vremOPQRSTUV, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
       __m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
       __m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(vaccGHIJKLMN, vaccOPQRSTUV), voutput_zero_point);
 
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      vout012389AB4567CDEF = _mm256_min_epi16(_mm256_max_epi16(vout012389AB4567CDEF, voutput_min), voutput_max);
-      voutGHIJOPQRKLMNSTUV = _mm256_min_epi16(_mm256_max_epi16(voutGHIJOPQRKLMNSTUV, voutput_min), voutput_max);
-
       __m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
       __m128i voutGHIJKLMNOPQRSTUV = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(voutGHIJOPQRKLMNSTUV), _mm256_extracti128_si256(voutGHIJOPQRKLMNSTUV, 1)), _MM_SHUFFLE(3, 1, 2, 0));
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
+      vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
+      voutGHIJKLMNOPQRSTUV = _mm_max_epi8(voutGHIJKLMNOPQRSTUV, voutput_min);
+      voutGHIJKLMNOPQRSTUV = _mm_min_epi8(voutGHIJKLMNOPQRSTUV, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
       _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
       output += 32;
@@ -354,8 +356,8 @@
         w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
         k += 8;
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -367,23 +369,23 @@
 
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
 
         if XNN_LIKELY(c >= 8) {
           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-avx-mul16.c b/src/qs8-dwconv/gen/up8x25-minmax-avx-mul16.c
index 0ab2a49..b23e417 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-avx-mul16.c
@@ -491,8 +491,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -509,28 +509,31 @@
       const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
       const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
     }
@@ -841,8 +844,8 @@
         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp24x01234567lo, vp24x01234567hi));
 
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -861,28 +864,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if (c & 4) {
           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-avx-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-avx-mul32.c
index ae3ec16..889b7a8 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-avx-mul32.c
@@ -392,8 +392,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -410,27 +410,27 @@
       const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
       const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
 
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
@@ -594,8 +594,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -607,23 +607,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-avx2-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-avx2-mul32.c
index cdabcdf..a298d77 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-avx2-mul32.c
@@ -315,8 +315,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -328,24 +328,25 @@
 
       const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
     }
@@ -480,8 +481,8 @@
         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi24x01234567, vk24x01234567));
 
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -493,23 +494,23 @@
 
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
 
         if (c & 4) {
           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-sse2-mul16.c b/src/qs8-dwconv/gen/up8x25-minmax-sse2-mul16.c
index 493b0bb..498844c 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-sse2-mul16.c
@@ -537,7 +537,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -547,11 +547,14 @@
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
 
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
     }
@@ -909,7 +912,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -918,12 +921,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if (c & 4) {
           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
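
Note (editor's illustrative sketch, not part of the generated kernels): the shift parameter is now fetched with _mm_loadl_epi64 rather than a full 128-bit load. This is safe because _mm_sra_epi32 takes its shift count from the low 64 bits of the count operand, so a 64-bit shift field (name assumed here) is enough:

  #include <immintrin.h>
  #include <stdint.h>

  // Sketch only: arithmetic right shift of four 32-bit lanes by a runtime count
  // stored as a single 64-bit field; only the low 8 bytes of vshift are used.
  static inline __m128i qs8_shift_right_sketch(__m128i vq31prod, const int64_t* shift_field) {
    const __m128i vshift = _mm_loadl_epi64((const __m128i*) shift_field);  // low 64 bits, upper zeroed
    return _mm_sra_epi32(vq31prod, vshift);                                // per-lane >> shift
  }
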
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-sse41-mul16.c b/src/qs8-dwconv/gen/up8x25-minmax-sse41-mul16.c
index da1a284..45dfe82 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-sse41-mul16.c
@@ -491,8 +491,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -509,28 +509,31 @@
       const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
       const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
     }
@@ -841,8 +844,8 @@
         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp24x01234567lo, vp24x01234567hi));
 
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -861,28 +864,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if (c & 4) {
           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
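In the SSE4.1 kernels the output clamp moves past the int16-to-int8 pack: with _mm_max_epi8/_mm_min_epi8 available, output_min and output_max can be kept in the specialized sse4 params as 16 int8 lanes and applied to the already-packed bytes. A minimal sketch of that epilogue under assumed, simplified parameter names (not the generated code):

    #include <smmintrin.h>  /* SSE4.1 */

    /* vacc0123/vacc4567: requantized int32 accumulators; vzero_point: 8 x int16;
     * vout_min/vout_max: 16 x int8 clamp bounds (hypothetical layout). */
    static __m128i qs8_epilogue(__m128i vacc0123, __m128i vacc4567,
                                __m128i vzero_point,
                                __m128i vout_min, __m128i vout_max) {
      __m128i vout16 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), vzero_point);
      __m128i vout8 = _mm_packs_epi16(vout16, vout16);
      vout8 = _mm_max_epi8(vout8, vout_min);  /* byte-wise clamp requires SSE4.1 */
      return _mm_min_epi8(vout8, vout_max);
    }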
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-sse41-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-sse41-mul32.c
index ce92c24..3ccbf5a 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-sse41-mul32.c
@@ -392,8 +392,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -410,27 +410,27 @@
       const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
       const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
 
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
@@ -594,8 +594,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -607,23 +607,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-ssse3-mul16.c b/src/qs8-dwconv/gen/up8x25-minmax-ssse3-mul16.c
index 35f720d..5704b3b 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-ssse3-mul16.c
@@ -537,7 +537,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -547,11 +547,14 @@
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
 
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
     }
@@ -909,7 +912,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -918,12 +921,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if (c & 4) {
           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
diff --git a/src/qs8-dwconv/gen/up8x25-minmax-xop-mul32.c b/src/qs8-dwconv/gen/up8x25-minmax-xop-mul32.c
index 657ece3..a4467b8 100644
--- a/src/qs8-dwconv/gen/up8x25-minmax-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up8x25-minmax-xop-mul32.c
@@ -397,8 +397,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -415,27 +415,27 @@
       const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
       const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
 
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
@@ -599,8 +599,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -612,23 +612,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-avx-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-avx-mul16.c
index 0b9ee4f..7f4f9e7 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-avx-mul16.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-avx-mul16.c
@@ -203,8 +203,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -221,28 +221,31 @@
       const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
       const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
     }
@@ -361,8 +364,8 @@
         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp8x01234567lo, vp8x01234567hi));
 
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -381,28 +384,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if (c & 4) {
           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-avx-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-avx-mul32.c
index 11db437..20ae162 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-avx-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-avx-mul32.c
@@ -168,8 +168,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -186,27 +186,27 @@
       const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
       const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
 
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
@@ -274,8 +274,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -287,23 +287,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-avx2-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-avx2-mul32.c
index 3e82ebf..fdebd15 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-avx2-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-avx2-mul32.c
@@ -139,8 +139,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -152,24 +152,25 @@
 
       const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       const __m256i vrem01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
       vacc01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
     }
@@ -224,8 +225,8 @@
         vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vi8x01234567, vk8x01234567));
 
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc1357 = _mm256_shuffle_epi32(vacc01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -237,23 +238,23 @@
 
         const __m256i vq31prod01234567 = _mm256_blend_epi16(vq31prod0246, vq31prod1357, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem01234567 =
           _mm256_add_epi32(_mm256_and_si256(vq31prod01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod01234567));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
         vacc01234567 =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod01234567, vshift), _mm256_cmpgt_epi32(vrem01234567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
 
         if (c & 4) {
           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
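For the AVX2 kernels the specialized layout stores the requantization constants as full 256-bit fields, so the old broadcast of a 16-byte sse2 field (_mm256_broadcastsi128_si256 around _mm_load_si128) collapses into a single aligned _mm256_load_si256; only the shift remains a 128-bit load because _mm256_sra_epi32 takes an __m128i count. A minimal sketch with a hypothetical params struct (names and sizes assumed):

    #include <immintrin.h>  /* AVX2 */
    #include <stdalign.h>
    #include <stdint.h>

    /* Hypothetical 32-byte-aligned field holding the same multiplier in all 8 lanes. */
    struct avx2_params_sketch {
      alignas(32) int32_t multiplier[8];
    };

    static __m256i load_multiplier(const struct avx2_params_sketch* params) {
      /* Old path: _mm256_broadcastsi128_si256(_mm_load_si128(...)) on a 16-byte field. */
      return _mm256_load_si256((const __m256i*) params->multiplier);
    }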
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-sse2-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-sse2-mul16.c
index 554a00a..d9bb1c1 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-sse2-mul16.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-sse2-mul16.c
@@ -249,7 +249,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -259,11 +259,14 @@
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
 
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
     }
@@ -429,7 +432,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -438,12 +441,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if (c & 4) {
           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-sse41-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-sse41-mul16.c
index 38d707a..0a422aa 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-sse41-mul16.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-sse41-mul16.c
@@ -203,8 +203,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -221,28 +221,31 @@
       const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
       const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
+
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
     }
@@ -361,8 +364,8 @@
         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp8x01234567lo, vp8x01234567hi));
 
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
         const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -381,28 +384,28 @@
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
         const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
         const __m128i vrem4567 =
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+        vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123456701234567 = _mm_min_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         if (c & 4) {
           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-sse41-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-sse41-mul32.c
index 1ca6748..4cb309e 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-sse41-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-sse41-mul32.c
@@ -168,8 +168,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -186,27 +186,27 @@
       const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
       const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
 
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
@@ -274,8 +274,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -287,23 +287,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-ssse3-mul16.c b/src/qs8-dwconv/gen/up8x9-minmax-ssse3-mul16.c
index f21866a..b1f1651 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-ssse3-mul16.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-ssse3-mul16.c
@@ -249,7 +249,7 @@
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
       const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
@@ -259,11 +259,14 @@
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+
       const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+      vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
 
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
     }
@@ -429,7 +432,7 @@
           _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
         const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
         vacc4567 =
@@ -438,12 +441,12 @@
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
+        vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_min));
+        vout01234567 = _mm_min_epi16(vout01234567, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
 
+
         if (c & 4) {
           *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
diff --git a/src/qs8-dwconv/gen/up8x9-minmax-xop-mul32.c b/src/qs8-dwconv/gen/up8x9-minmax-xop-mul32.c
index cb711f2..f3d8308 100644
--- a/src/qs8-dwconv/gen/up8x9-minmax-xop-mul32.c
+++ b/src/qs8-dwconv/gen/up8x9-minmax-xop-mul32.c
@@ -173,8 +173,8 @@
 
       w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
       const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding);
@@ -191,27 +191,27 @@
       const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
       const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       const __m128i vrem0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
       const __m128i vrem4567 =
         _mm_add_epi32(_mm_and_si128(vq31prod4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod4567));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       vacc0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
       vacc4567 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+      vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
 
       _mm_storel_epi64((__m128i*) output, vout0123456701234567);
       output += 8;
@@ -279,8 +279,8 @@
         w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
         k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -292,23 +292,21 @@
 
         const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem0123 =
           _mm_add_epi32(_mm_and_si128(vq31prod0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0123));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc0123 =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout0123 = _mm_min_epi16(_mm_max_epi16(vout0123, voutput_min), voutput_max);
-
         vout0123 = _mm_packs_epi16(vout0123, vout0123);
+        vout0123 = _mm_max_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout0123 = _mm_min_epi8(vout0123, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         if XNN_LIKELY(c >= 4) {
           _mm_storeu_si32(output, vout0123);
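The SSE4/XOP hunks above move the output clamp past the pack: the sse2 layout keeps output_min/output_max replicated as int16 and clamps with _mm_max_epi16/_mm_min_epi16 before _mm_packs_epi16, while the sse4 layout keeps them replicated as int8 and clamps the packed result with _mm_max_epi8/_mm_min_epi8 (SSE4.1 instructions), so a single clamp covers everything that was just packed. A minimal side-by-side sketch, with the bound vectors assumed to be pre-replicated in the matching element width:

#include <immintrin.h>

/* SSE2-style path: clamp in int16 before packing (bounds replicated as 8 x int16). */
static inline __m128i clamp_then_pack_sse2(__m128i vout16, __m128i vmin16, __m128i vmax16) {
  vout16 = _mm_min_epi16(_mm_max_epi16(vout16, vmin16), vmax16);
  return _mm_packs_epi16(vout16, vout16);
}

/* SSE4.1-style path: pack first, then clamp the int8 result
 * (bounds replicated as 16 x int8; _mm_max_epi8/_mm_min_epi8 are SSE4.1-only). */
static inline __m128i pack_then_clamp_sse41(__m128i vout16, __m128i vmin8, __m128i vmax8) {
  __m128i vout8 = _mm_packs_epi16(vout16, vout16);
  return _mm_min_epi8(_mm_max_epi8(vout8, vmin8), vmax8);
}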
diff --git a/src/qs8-dwconv/unipass-avx2-mul16.c.in b/src/qs8-dwconv/unipass-avx2-mul16.c.in
index fb2be5e..f61650f 100644
--- a/src/qs8-dwconv/unipass-avx2-mul16.c.in
+++ b/src/qs8-dwconv/unipass-avx2-mul16.c.in
@@ -63,8 +63,8 @@
 
       w = (const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       $for C in range(0, CHANNEL_TILE, 8):
         const __m256i vacc${ABC[C+1:C+8:2]} = _mm256_shuffle_epi32(vacc${ABC[C:C+8]}, _MM_SHUFFLE(3, 3, 1, 1));
@@ -80,29 +80,33 @@
       $for C in range(0, CHANNEL_TILE, 8):
         const __m256i vq31prod${ABC[C:C+8]} = _mm256_blend_epi16(vq31prod${ABC[C:C+8:2]}, vq31prod${ABC[C+1:C+8:2]}, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       $for C in range(0, CHANNEL_TILE, 8):
         const __m256i vrem${ABC[C:C+8]} =
           _mm256_add_epi32(_mm256_and_si256(vq31prod${ABC[C:C+8]}, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${ABC[C:C+8]}));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      $if CHANNEL_TILE > 8:
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
+      $else:
+        const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
       $for C in range(0, CHANNEL_TILE, 8):
         vacc${ABC[C:C+8]} =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${ABC[C:C+8]}, vshift), _mm256_cmpgt_epi32(vrem${ABC[C:C+8]}, vremainder_threshold));
 
-      const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+      const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
       $for C in range(0, CHANNEL_TILE, 16):
-        __m256i vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]} = _mm256_adds_epi16(_mm256_packs_epi32(vacc${ABC[C:C+8]}, vacc${ABC[C+8:C+16]}), voutput_zero_point);
-
-      const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-      const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      $for C in range(0, CHANNEL_TILE, 16):
-        vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]} = _mm256_min_epi16(_mm256_max_epi16(vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]}, voutput_min), voutput_max);
+        const __m256i vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]} = _mm256_adds_epi16(_mm256_packs_epi32(vacc${ABC[C:C+8]}, vacc${ABC[C+8:C+16]}), voutput_zero_point);
 
       $for C in range(0, CHANNEL_TILE, 16):
         __m128i vout${ABC[C:C+16]} = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]}), _mm256_extracti128_si256(vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]}, 1)), _MM_SHUFFLE(3, 1, 2, 0));
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      $for C in range(0, CHANNEL_TILE, 16):
+        vout${ABC[C:C+16]} = _mm_max_epi8(vout${ABC[C:C+16]}, voutput_min);
+        vout${ABC[C:C+16]} = _mm_min_epi8(vout${ABC[C:C+16]}, voutput_max);
+
       _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
       $for C in range(16, CHANNEL_TILE, 16):
         _mm_storeu_si128((__m128i*) (output + ${C}), vout${ABC[C:C+16]});
@@ -137,8 +141,8 @@
           w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t));
           k += 16;
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc${ABC[1:8:2]} = _mm256_shuffle_epi32(vacc${ABC[0:8]}, _MM_SHUFFLE(3, 3, 1, 1));
         const __m256i vacc${ABC[9:16:2]} = _mm256_shuffle_epi32(vacc${ABC[8:16]}, _MM_SHUFFLE(3, 3, 1, 1));
@@ -156,29 +160,28 @@
         const __m256i vq31prod${ABC[0:8]} = _mm256_blend_epi16(vq31prod${ABC[0:8:2]}, vq31prod${ABC[1:8:2]}, 0xCC);
         const __m256i vq31prod${ABC[8:16]} = _mm256_blend_epi16(vq31prod${ABC[8:16:2]}, vq31prod${ABC[9:16:2]}, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem${ABC[0:8]} =
           _mm256_add_epi32(_mm256_and_si256(vq31prod${ABC[0:8]}, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${ABC[0:8]}));
         const __m256i vrem${ABC[8:16]} =
           _mm256_add_epi32(_mm256_and_si256(vq31prod${ABC[8:16]}, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${ABC[8:16]}));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
         vacc${ABC[0:8]} =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${ABC[0:8]}, vshift), _mm256_cmpgt_epi32(vrem${ABC[0:8]}, vremainder_threshold));
         vacc${ABC[8:16]} =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${ABC[8:16]}, vshift), _mm256_cmpgt_epi32(vrem${ABC[8:16]}, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[0:8]}), _mm256_extracti128_si256(vacc${ABC[0:8]}, 1)), voutput_zero_point);
         __m128i vout${ABC[8:16]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[8:16]}), _mm256_extracti128_si256(vacc${ABC[8:16]}, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout${ABC[0:8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[0:8]}, voutput_min), voutput_max);
-        vout${ABC[8:16]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[8:16]}, voutput_min), voutput_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
 
         __m128i vout${ABC[0:16]} = _mm_packs_epi16(vout${ABC[0:8]}, vout${ABC[8:16]});
+        vout${ABC[0:16]} = _mm_min_epi8(_mm_max_epi8(vout${ABC[0:16]}, voutput_min), voutput_max);
 
         $if CHANNEL_TILE > 16:
           if XNN_LIKELY(c >= 16) {
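The avx2 loads above rely on a params variant whose constants are pre-replicated across full 32-byte, 32-byte-aligned fields, so one aligned _mm256_load_si256 replaces the earlier 128-bit load plus vbroadcasti128. The struct below is a purely illustrative sketch of such a layout; the field names follow the accessors in the diff, but the types, ordering, and the real xnn_qs8 params definitions are assumptions, not taken from this commit:

#include <stdint.h>

/* Illustrative layout only -- not the actual XNNPACK params definition.
 * Constants read with _mm256_load_si256 are stored pre-replicated in 32-byte
 * fields; those fields come first so each one stays 32-byte aligned. */
struct example_qs8_avx2_params {
  int32_t multiplier[8];           /* multiplier replicated into all 8 lanes */
  int64_t rounding[4];             /* 64-bit rounding term, replicated x4 */
  int32_t remainder_mask[8];
  int32_t remainder_threshold[8];
  int16_t output_zero_point[16];   /* replicated int16 zero point */
  int8_t output_min[32];           /* replicated int8 bounds for the final clamp */
  int8_t output_max[32];
  uint64_t shift[2];               /* only the low 64 bits feed _mm_sra_epi32 */
} __attribute__((aligned(32)));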
diff --git a/src/qs8-dwconv/unipass-avx2-mul32.c.in b/src/qs8-dwconv/unipass-avx2-mul32.c.in
index 41086fd..8d808fc 100644
--- a/src/qs8-dwconv/unipass-avx2-mul32.c.in
+++ b/src/qs8-dwconv/unipass-avx2-mul32.c.in
@@ -60,8 +60,8 @@
 
       w = (const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(int8_t));
 
-      const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-      const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+      const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+      const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
       $for C in range(0, CHANNEL_TILE, 8):
         const __m256i vacc${ABC[C+1:C+8:2]} = _mm256_shuffle_epi32(vacc${ABC[C:C+8]}, _MM_SHUFFLE(3, 3, 1, 1));
@@ -77,21 +77,24 @@
       $for C in range(0, CHANNEL_TILE, 8):
         const __m256i vq31prod${ABC[C:C+8]} = _mm256_blend_epi16(vq31prod${ABC[C:C+8:2]}, vq31prod${ABC[C+1:C+8:2]}, 0xCC);
 
-      const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+      const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
       $for C in range(0, CHANNEL_TILE, 8):
         const __m256i vrem${ABC[C:C+8]} =
           _mm256_add_epi32(_mm256_and_si256(vq31prod${ABC[C:C+8]}, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${ABC[C:C+8]}));
 
-      const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+      $if CHANNEL_TILE > 8:
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
+      $else:
+        const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
       $for C in range(0, CHANNEL_TILE, 8):
         vacc${ABC[C:C+8]} =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${ABC[C:C+8]}, vshift), _mm256_cmpgt_epi32(vrem${ABC[C:C+8]}, vremainder_threshold));
 
       $if CHANNEL_TILE > 8:
-        const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+        const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
       $else:
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
       $for C in range(0, CHANNEL_TILE, 16):
         $if C + 8 < CHANNEL_TILE:
           __m256i vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]} = _mm256_adds_epi16(_mm256_packs_epi32(vacc${ABC[C:C+8]}, vacc${ABC[C+8:C+16]}), voutput_zero_point);
@@ -100,26 +103,22 @@
         $else:
           __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[C:C+8]}), _mm256_extracti128_si256(vacc${ABC[C:C+8]}, 1)), voutput_zero_point);
 
-      $if CHANNEL_TILE > 8:
-        const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-        const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-      $else:
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      $for C in range(0, CHANNEL_TILE, 16):
-        $if C + 8 < CHANNEL_TILE:
-          vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]} = _mm256_min_epi16(_mm256_max_epi16(vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]}, voutput_min), voutput_max);
-        $elif CHANNEL_TILE > 8:
-          vout${ABC[C:C+8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[C:C+8]}, _mm256_castsi256_si128(voutput_min)), _mm256_castsi256_si128(voutput_max));
-        $else:
-          vout${ABC[C:C+8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[C:C+8]}, voutput_min), voutput_max);
-
       $for C in range(0, CHANNEL_TILE, 16):
         $if C + 8 < CHANNEL_TILE:
           __m128i vout${ABC[C:C+16]} = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]}), _mm256_extracti128_si256(vout${ABC[C:C+4]}${ABC[C+8:C+12]}${ABC[C+4:C+8]}${ABC[C+12:C+16]}, 1)), _MM_SHUFFLE(3, 1, 2, 0));
         $else:
           __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
 
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+      $for C in range(0, CHANNEL_TILE, 16):
+        $if C + 8 < CHANNEL_TILE:
+          vout${ABC[C:C+16]} = _mm_max_epi8(vout${ABC[C:C+16]}, voutput_min);
+          vout${ABC[C:C+16]} = _mm_min_epi8(vout${ABC[C:C+16]}, voutput_max);
+        $else:
+          vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
+          vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_min_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max);
+
       $if CHANNEL_TILE > 8:
         _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
       $else:
@@ -156,8 +155,8 @@
           w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
           k += 8;
 
-        const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-        const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+        const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+        const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
         const __m256i vacc${ABC[1:8:2]} = _mm256_shuffle_epi32(vacc${ABC[0:8]}, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -169,23 +168,23 @@
 
         const __m256i vq31prod${ABC[0:8]} = _mm256_blend_epi16(vq31prod${ABC[0:8:2]}, vq31prod${ABC[1:8:2]}, 0xCC);
 
-        const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+        const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
         const __m256i vrem${ABC[0:8]} =
           _mm256_add_epi32(_mm256_and_si256(vq31prod${ABC[0:8]}, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${ABC[0:8]}));
 
-        const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+        const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
         vacc${ABC[0:8]} =
           _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${ABC[0:8]}, vshift), _mm256_cmpgt_epi32(vrem${ABC[0:8]}, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
         __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[0:8]}), _mm256_extracti128_si256(vacc${ABC[0:8]}, 1)), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout${ABC[0:8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[0:8]}, voutput_min), voutput_max);
-
         __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packs_epi16(vout${ABC[0:8]}, vout${ABC[0:8]});
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->avx2.output_max);
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
+        vout${ABC[0:8]}${ABC[0:8]} = _mm_min_epi8(vout${ABC[0:8]}${ABC[0:8]}, voutput_max);
+        vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epi8(vout${ABC[0:8]}${ABC[0:8]}, voutput_min);
 
         $if CHANNEL_TILE > 8:
           if XNN_LIKELY(c >= 8) {
diff --git a/src/qs8-dwconv/unipass-sse-mul16.c.in b/src/qs8-dwconv/unipass-sse-mul16.c.in
index d53528b..5036dd7 100644
--- a/src/qs8-dwconv/unipass-sse-mul16.c.in
+++ b/src/qs8-dwconv/unipass-sse-mul16.c.in
@@ -15,6 +15,7 @@
 #include <xnnpack/dwconv.h>
 
 
+$PARAMS_STRUCT = "sse4" if SSE >= 4 else "sse2"
 $ISA = "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
 void xnn_qs8_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${ISA}_mul16(
     size_t channels,
@@ -76,8 +77,8 @@
 
       w = (const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.rounding);
 
       $if SSE >= 4:
         $for C in range(0, CHANNEL_TILE, 4):
@@ -125,25 +126,29 @@
         $for C in range(0, CHANNEL_TILE, 4):
           const __m128i vq31prod${ABC[C:C+4]} = _mm_shuffle_epi32(vq31prod${ABC[C:C+4:2]}${ABC[C+1:C+4:2]}, _MM_SHUFFLE(3, 1, 2, 0));
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_mask);
       $for C in range(0, CHANNEL_TILE, 4):
         const __m128i vrem${ABC[C:C+4]} =
           _mm_add_epi32(_mm_and_si128(vq31prod${ABC[C:C+4]}, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod${ABC[C:C+4]}));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->${PARAMS_STRUCT}.shift);
       $for C in range(0, CHANNEL_TILE, 4):
         vacc${ABC[C:C+4]} =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod${ABC[C:C+4]}, vshift), _mm_cmpgt_epi32(vrem${ABC[C:C+4]}, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
       $for C in range(0, CHANNEL_TILE, 8):
         __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      $for C in range(0, CHANNEL_TILE, 8):
-        vout${ABC[C:C+8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[C:C+8]}, voutput_min), voutput_max);
+      $if SSE < 4:
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+        $for C in range(0, CHANNEL_TILE, 8):
+          vout${ABC[C:C+8]} = _mm_max_epi16(vout${ABC[C:C+8]}, voutput_min);
+
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+        $for C in range(0, CHANNEL_TILE, 8):
+          vout${ABC[C:C+8]} = _mm_min_epi16(vout${ABC[C:C+8]}, voutput_max);
 
       $for C in range(0, CHANNEL_TILE, 16):
         $if C + 8 < CHANNEL_TILE:
@@ -151,6 +156,21 @@
         $else:
           __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
 
+      $if SSE == 4:
+        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+        $for C in range(0, CHANNEL_TILE, 16):
+          $if C + 8 < CHANNEL_TILE:
+            vout${ABC[C:C+16]} = _mm_max_epi8(vout${ABC[C:C+16]}, voutput_min);
+          $else:
+            vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
+
+        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
+        $for C in range(0, CHANNEL_TILE, 16):
+          $if C + 8 < CHANNEL_TILE:
+            vout${ABC[C:C+16]} = _mm_min_epi8(vout${ABC[C:C+16]}, voutput_max);
+          $else:
+            vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_min_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max);
+
       $if CHANNEL_TILE > 8:
         _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
       $else:
@@ -200,10 +220,10 @@
           w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
           k += 8;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.rounding);
 
-        $if SSE >= 4:
+        $if SSE == 4:
           const __m128i vacc${ABC[1:4:2]} = _mm_shuffle_epi32(vacc${ABC[0:4]}, _MM_SHUFFLE(3, 3, 1, 1));
           const __m128i vacc${ABC[5:8:2]} = _mm_shuffle_epi32(vacc${ABC[4:8]}, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -262,28 +282,32 @@
           const __m128i vq31prod${ABC[0:4]} = _mm_shuffle_epi32(vq31prod${ABC[0:4:2]}${ABC[1:4:2]}, _MM_SHUFFLE(3, 1, 2, 0));
           const __m128i vq31prod${ABC[4:8]} = _mm_shuffle_epi32(vq31prod${ABC[4:8:2]}${ABC[5:8:2]}, _MM_SHUFFLE(3, 1, 2, 0));
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_mask);
         const __m128i vrem${ABC[0:4]} =
           _mm_add_epi32(_mm_and_si128(vq31prod${ABC[0:4]}, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod${ABC[0:4]}));
         const __m128i vrem${ABC[4:8]} =
           _mm_add_epi32(_mm_and_si128(vq31prod${ABC[4:8]}, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod${ABC[4:8]}));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->${PARAMS_STRUCT}.shift);
         vacc${ABC[0:4]} =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod${ABC[0:4]}, vshift), _mm_cmpgt_epi32(vrem${ABC[0:4]}, vremainder_threshold));
         vacc${ABC[4:8]} =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod${ABC[4:8]}, vshift), _mm_cmpgt_epi32(vrem${ABC[4:8]}, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
         __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout${ABC[0:8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[0:8]}, voutput_min), voutput_max);
+        $if SSE < 4:
+          vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, _mm_load_si128((const __m128i*) params->sse2.output_min));
+          vout${ABC[0:8]} = _mm_min_epi16(vout${ABC[0:8]}, _mm_load_si128((const __m128i*) params->sse2.output_max));
 
         __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packs_epi16(vout${ABC[0:8]}, vout${ABC[0:8]});
 
+        $if SSE == 4:
+          vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epi8(vout${ABC[0:8]}${ABC[0:8]}, _mm_load_si128((const __m128i*) params->sse4.output_min));
+          vout${ABC[0:8]}${ABC[0:8]} = _mm_min_epi8(vout${ABC[0:8]}${ABC[0:8]}, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
         $if CHANNEL_TILE > 8:
           if XNN_LIKELY(c >= 8) {
             _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]});
diff --git a/src/qs8-dwconv/unipass-sse-mul32.c.in b/src/qs8-dwconv/unipass-sse-mul32.c.in
index 73e1332..739501c 100644
--- a/src/qs8-dwconv/unipass-sse-mul32.c.in
+++ b/src/qs8-dwconv/unipass-sse-mul32.c.in
@@ -75,8 +75,8 @@
 
       w = (const void*) ((uintptr_t) w + ${CHANNEL_TILE} * sizeof(int32_t) + ${KERNEL_TILE * CHANNEL_TILE} * sizeof(int8_t));
 
-      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+      const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+      const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
       $for C in range(0, CHANNEL_TILE, 4):
         const __m128i vacc${ABC[C+1:C+4:2]} = _mm_shuffle_epi32(vacc${ABC[C:C+4]}, _MM_SHUFFLE(3, 3, 1, 1));
@@ -90,31 +90,32 @@
       $for C in range(0, CHANNEL_TILE, 4):
         const __m128i vq31prod${ABC[C:C+4]} = _mm_blend_epi16(vq31prod${ABC[C:C+4:2]}, vq31prod${ABC[C+1:C+4:2]}, 0xCC);
 
-      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+      const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
       $for C in range(0, CHANNEL_TILE, 4):
         const __m128i vrem${ABC[C:C+4]} =
           _mm_add_epi32(_mm_and_si128(vq31prod${ABC[C:C+4]}, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod${ABC[C:C+4]}));
 
-      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-      const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+      const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
       $for C in range(0, CHANNEL_TILE, 4):
         vacc${ABC[C:C+4]} =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod${ABC[C:C+4]}, vshift), _mm_cmpgt_epi32(vrem${ABC[C:C+4]}, vremainder_threshold));
 
-      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
       $for C in range(0, CHANNEL_TILE, 8):
         __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point);
 
-      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-      $for C in range(0, CHANNEL_TILE, 8):
-        vout${ABC[C:C+8]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[C:C+8]}, voutput_min), voutput_max);
-
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
       $for C in range(0, CHANNEL_TILE, 16):
         $if C + 8 < CHANNEL_TILE:
           __m128i vout${ABC[C:C+16]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]});
+          vout${ABC[C:C+16]} = _mm_max_epi8(vout${ABC[C:C+16]}, voutput_min);
+          vout${ABC[C:C+16]} = _mm_min_epi8(vout${ABC[C:C+16]}, voutput_max);
         $else:
           __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_packs_epi16(vout${ABC[C:C+8]}, vout${ABC[C:C+8]});
+          vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min);
+          vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_min_epi8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max);
 
       $if CHANNEL_TILE > 8:
         _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
@@ -155,8 +156,8 @@
           w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
           k += 4;
 
-        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+        const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+        const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
         const __m128i vacc${ABC[1:4:2]} = _mm_shuffle_epi32(vacc${ABC[0:4]}, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -168,23 +169,21 @@
 
         const __m128i vq31prod${ABC[0:4]} = _mm_blend_epi16(vq31prod${ABC[0:4:2]}, vq31prod${ABC[1:4:2]}, 0xCC);
 
-        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+        const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
         const __m128i vrem${ABC[0:4]} =
           _mm_add_epi32(_mm_and_si128(vq31prod${ABC[0:4]}, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod${ABC[0:4]}));
 
-        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-        const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+        const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+        const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
         vacc${ABC[0:4]} =
           _mm_sub_epi32(_mm_sra_epi32(vq31prod${ABC[0:4]}, vshift), _mm_cmpgt_epi32(vrem${ABC[0:4]}, vremainder_threshold));
 
-        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+        const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout${ABC[0:4]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[0:4]}), voutput_zero_point);
 
-        const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-        const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-        vout${ABC[0:4]} = _mm_min_epi16(_mm_max_epi16(vout${ABC[0:4]}, voutput_min), voutput_max);
-
         vout${ABC[0:4]} = _mm_packs_epi16(vout${ABC[0:4]}, vout${ABC[0:4]});
+        vout${ABC[0:4]} = _mm_max_epi8(vout${ABC[0:4]}, _mm_load_si128((const __m128i*) params->sse4.output_min));
+        vout${ABC[0:4]} = _mm_min_epi8(vout${ABC[0:4]}, _mm_load_si128((const __m128i*) params->sse4.output_max));
 
         $if CHANNEL_TILE > 4:
           if XNN_LIKELY(c >= 4) {
diff --git a/src/qs8-gemm/MRx16c8-avx512skx.c.in b/src/qs8-gemm/MRx16c8-avx512skx.c.in
index adcbe21..7870b6c 100644
--- a/src/qs8-gemm/MRx16c8-avx512skx.c.in
+++ b/src/qs8-gemm/MRx16c8-avx512skx.c.in
@@ -61,19 +61,24 @@
 
   const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
   const __mmask16 vblend_mask = _cvtu32_mask16(0xAAAA);
-  const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-  const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.rounding));
-  const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
-  const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-  const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+  const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.multiplier));
+  const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.rounding));
+  const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.remainder_mask));
+  const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.remainder_threshold));
+  const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
   $if MR > 1:
-    const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-    const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
   $else:
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
+  $if MR > 2:
+    const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_min));
+    const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_max));
+  $elif MR == 2:
+    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_min));
+    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_max));
+  $else:
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
   do {
     __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
     $for N in range(4, 16, 4):
@@ -147,26 +152,28 @@
         _mm512_mask_sub_epi32(vacc${M}x084C195D2A6E3B7F, _mm512_cmpgt_epi32_mask(vrem${M}x084C195D2A6E3B7F, vremainder_threshold), vacc${M}x084C195D2A6E3B7F, vminus_one);
 
     $if MR == 1:
-      __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
-      vacc0x084C2A6E195D3B7F = _mm256_min_epi16(_mm256_max_epi16(vacc0x084C2A6E195D3B7F, voutput_min), voutput_max);
+      const __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
     $else:
       $for M in range(0, MR, 2):
-        __m512i vacc${M}${min(M+1, MR-1)}x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc${M}x084C195D2A6E3B7F, vacc${min(M+1, MR-1)}x084C195D2A6E3B7F), voutput_zero_point);
-
-      $for M in range(0, MR, 2):
-        vacc${M}${min(M+1, MR-1)}x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc${M}${min(M+1, MR-1)}x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
+        const __m512i vacc${M}${min(M+1, MR-1)}x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc${M}x084C195D2A6E3B7F, vacc${min(M+1, MR-1)}x084C195D2A6E3B7F), voutput_zero_point);
 
     $if MR > 2:
       __m512i vout012${min(3, MR-1)}x084Cx195Dx2A6Ex3B7F = _mm512_packs_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc2${min(3, MR-1)}x084Cx195Dx2A6Ex3B7F);
       vout012${min(M+3, MR-1)}x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout012${min(3, MR-1)}x084Cx195Dx2A6Ex3B7F);
-      const __m512i vout012${min(3, MR-1)}x0123456789ABCDEF = _mm512_shuffle_epi8(vout012${min(3, MR-1)}x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+      __m512i vout012${min(3, MR-1)}x0123456789ABCDEF = _mm512_shuffle_epi8(vout012${min(3, MR-1)}x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+      vout012${min(3, MR-1)}x0123456789ABCDEF = _mm512_max_epi8(vout012${min(3, MR-1)}x0123456789ABCDEF, voutput_min);
+      vout012${min(3, MR-1)}x0123456789ABCDEF = _mm512_min_epi8(vout012${min(3, MR-1)}x0123456789ABCDEF, voutput_max);
     $elif MR == 2:
       const __m256i vout01x084Cx2A6Ex195Dx3B7F = _mm256_packs_epi16(_mm512_castsi512_si256(vacc01x084Cx195Dx2A6Ex3B7F), _mm512_extracti32x8_epi32(vacc01x084Cx195Dx2A6Ex3B7F, 1));
       const __m256i vout01x084C2A6E195D3B7F = _mm256_permutevar8x32_epi32(vout01x084Cx2A6Ex195Dx3B7F, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
-      const __m256i vout01x0123456789ABCDEF = _mm256_shuffle_epi8(vout01x084C2A6E195D3B7F, _mm256_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0, 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+      __m256i vout01x0123456789ABCDEF = _mm256_shuffle_epi8(vout01x084C2A6E195D3B7F, _mm256_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0, 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+      vout01x0123456789ABCDEF = _mm256_max_epi8(vout01x0123456789ABCDEF, voutput_min);
+      vout01x0123456789ABCDEF = _mm256_min_epi8(vout01x0123456789ABCDEF, voutput_max);
     $elif MR == 1:
       const __m128i vout0x084C2A6E195D3B7F = _mm_packs_epi16(_mm256_castsi256_si128(vacc0x084C2A6E195D3B7F), _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
-      const __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+      __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+      vout0x0123456789ABCDEF = _mm_max_epi8(vout0x0123456789ABCDEF, voutput_min);
+      vout0x0123456789ABCDEF = _mm_min_epi8(vout0x0123456789ABCDEF, voutput_max);
 
     $if MR > 2:
       if (nc >= 16) {
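In the AVX512 template above, the width of the int8 clamp tracks the number of rows packed together: MR > 2 packs four rows of 16 outputs into one 64-byte vector, MR == 2 into 32 bytes, and MR == 1 into 16 bytes, so the bounds are broadcast to __m512i, __m256i, or __m128i accordingly. A minimal sketch of the MR > 2 case, assuming AVX512BW (present on the Skylake-X target) for the byte-wise min/max:

#include <immintrin.h>

/* Sketch of the MR > 2 clamp: 4 rows x 16 int8 outputs occupy one __m512i,
 * so a single _mm512_max_epi8/_mm512_min_epi8 pair (AVX512BW) clamps all
 * 64 bytes at once; voutput_min/voutput_max hold the int8 bounds broadcast
 * to 512 bits. */
static inline __m512i clamp_packed_avx512(__m512i vout, __m512i voutput_min, __m512i voutput_max) {
  vout = _mm512_max_epi8(vout, voutput_min);
  return _mm512_min_epi8(vout, voutput_max);
}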
diff --git a/src/qs8-gemm/MRx4c2-sse.c.in b/src/qs8-gemm/MRx4c2-sse.c.in
index 9e25af1..7669bfa 100644
--- a/src/qs8-gemm/MRx4c2-sse.c.in
+++ b/src/qs8-gemm/MRx4c2-sse.c.in
@@ -27,6 +27,7 @@
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
+$PARAMS_STRUCT = "sse4" if SSE >= 4 else "sse2"
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
 void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x4c2__${ISA}${LOAD_SUFFIX}(
     size_t mr,
@@ -214,8 +215,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.rounding);
 
     $if SSE == 4:
       $for M in range(MR):
@@ -277,31 +278,39 @@
       $for M in range(MR):
         const __m128i vq31prod${M}x0123 = _mm_shuffle_epi32(vq31prod${M}x0213, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_mask);
     $for M in range(MR):
       const __m128i vrem${M}x0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod${M}x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod${M}x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_threshold);
+    $if M > 1:
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->${PARAMS_STRUCT}.shift);
+    $else:
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.shift);
     $for M in range(MR):
       vacc${M}x0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod${M}x0123, vshift), _mm_cmpgt_epi32(vrem${M}x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
     $for M in range(0, MR, 2):
       __m128i vacc${M}${min(M+1, MR-1)}x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc${M}x0123, vacc${min(M+1, MR-1)}x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    $for M in range(0, MR, 2):
-      vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
+    $if SSE < 4:
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+      $for M in range(0, MR, 2):
+        vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
 
     $if MR > 2:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
     $else:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
 
+    $if SSE == 4:
+      vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+      vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       $for M in range(1, MR):
diff --git a/src/qs8-gemm/MRx4c8-sse.c.in b/src/qs8-gemm/MRx4c8-sse.c.in
index 46a0fff..0f2ed51 100644
--- a/src/qs8-gemm/MRx4c8-sse.c.in
+++ b/src/qs8-gemm/MRx4c8-sse.c.in
@@ -27,6 +27,7 @@
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
+$PARAMS_STRUCT = "sse4" if SSE >= 4 else "sse2"
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
 void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_ukernel_${MR}x4c8__${ISA}${LOAD_SUFFIX}(
     size_t mr,
@@ -151,8 +152,8 @@
       $for M in range(MR):
         __m128i vacc${M}x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc${M}x02, vacc${M}x13), _mm_unpackhi_epi32(vacc${M}x02, vacc${M}x13));
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.rounding);
 
     $if SSE == 4:
       $for M in range(MR):
@@ -214,31 +215,39 @@
       $for M in range(MR):
         const __m128i vq31prod${M}x0123 = _mm_shuffle_epi32(vq31prod${M}x0213, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_mask);
     $for M in range(MR):
       const __m128i vrem${M}x0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod${M}x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod${M}x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_threshold);
+    $if M > 1:
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->${PARAMS_STRUCT}.shift);
+    $else:
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.shift);
     $for M in range(MR):
       vacc${M}x0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod${M}x0123, vshift), _mm_cmpgt_epi32(vrem${M}x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
     $for M in range(0, MR, 2):
       __m128i vacc${M}${min(M+1, MR-1)}x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc${M}x0123, vacc${min(M+1, MR-1)}x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    $for M in range(0, MR, 2):
-      vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
+    $if SSE < 4:
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
+      $for M in range(0, MR, 2):
+        vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
 
     $if MR > 2:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
     $else:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
 
+    $if SSE == 4:
+      vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+      vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       $for M in range(1, MR):
diff --git a/src/qs8-gemm/MRx8c8-avx2.c.in b/src/qs8-gemm/MRx8c8-avx2.c.in
index 70e4d32..bac1ce2 100644
--- a/src/qs8-gemm/MRx8c8-avx2.c.in
+++ b/src/qs8-gemm/MRx8c8-avx2.c.in
@@ -112,11 +112,11 @@
     $for M in range(MR):
       __m256i vacc${M}x01234567 = _mm256_permutevar8x32_epi32(vacc${M}x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
     $for M in range(MR):
-      const __m256i vacc${M}x11335577 = _mm256_shuffle_epi32(vacc${M}x01234567, _MM_SHUFFLE(3, 3, 1, 1));
+      const __m256i vacc${M}x11335577 = _mm256_srli_epi64(vacc${M}x01234567, 32);
 
     $for M in range(MR):
       const __m256i vprod${M}x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc${M}x01234567, vmultiplier), vrounding);
@@ -131,33 +131,35 @@
     $for M in range(MR):
       const __m256i vq31prod${M}x01234567 = _mm256_blend_epi16(vq31prod${M}x0246, vq31prod${M}x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     $for M in range(MR):
       const __m256i vrem${M}x01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod${M}x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${M}x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    $if M > 1:
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
+    $else:
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
     $for M in range(MR):
       vacc${M}x01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${M}x01234567, vshift), _mm256_cmpgt_epi32(vrem${M}x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     $for M in range(0, MR, 2):
       __m256i vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc${M}x01234567, vacc${min(M+1, MR-1)}x01234567), voutput_zero_point);
 
     $for M in range(0, MR, 2):
       vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_permute4x64_epi64(vacc${M}${min(M+1, MR-1)}x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    $for M in range(0, MR, 2):
-      vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc${M}${min(M+1, MR-1)}x01234567, voutput_min), voutput_max);
-
     $if MR > 2:
       __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc${min(2, MR-1)}${min(3, MR-1)}x01234567);
     $else:
       __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc0${min(1, MR-1)}x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
 
diff --git a/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c
index 412da85..7729f91 100644
--- a/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c
@@ -43,14 +43,14 @@
 
   const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
   const __mmask16 vblend_mask = _cvtu32_mask16(0xAAAA);
-  const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-  const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.rounding));
-  const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
-  const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-  const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
-  const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-  const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-  const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+  const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.multiplier));
+  const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.rounding));
+  const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.remainder_mask));
+  const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.remainder_threshold));
+  const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
+  const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+  const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
   do {
     __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
     __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
@@ -105,11 +105,12 @@
     vacc0x084C195D2A6E3B7F =
       _mm512_mask_sub_epi32(vacc0x084C195D2A6E3B7F, _mm512_cmpgt_epi32_mask(vrem0x084C195D2A6E3B7F, vremainder_threshold), vacc0x084C195D2A6E3B7F, vminus_one);
 
-    __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
-    vacc0x084C2A6E195D3B7F = _mm256_min_epi16(_mm256_max_epi16(vacc0x084C2A6E195D3B7F, voutput_min), voutput_max);
+    const __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
 
     const __m128i vout0x084C2A6E195D3B7F = _mm_packs_epi16(_mm256_castsi256_si128(vacc0x084C2A6E195D3B7F), _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
-    const __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+    __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+    vout0x0123456789ABCDEF = _mm_max_epi8(vout0x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = _mm_min_epi8(vout0x0123456789ABCDEF, voutput_max);
 
     if (nc >= 16) {
       _mm_storeu_si128((__m128i*) c0, vout0x0123456789ABCDEF);
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-avx-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-avx-ld128.c
index b02cf51..8350dc1 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-avx-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-avx-ld128.c
@@ -105,8 +105,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -119,24 +119,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-avx-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-avx-ld64.c
index 29a3338..63362ee 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-avx-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-avx-ld64.c
@@ -105,8 +105,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -119,24 +119,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
index f641df6..7df288a 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
@@ -153,6 +153,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
index b8af56f..d809fb1 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
@@ -153,6 +153,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
index 67226d8..6b57e90 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
@@ -105,8 +105,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -119,24 +119,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
index 31bd480..5a75904 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
@@ -105,8 +105,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -119,24 +119,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
index 064aab2..b3ea64b 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
@@ -153,6 +153,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
index 5065d6f..1401dc4 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
@@ -153,6 +153,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
index a208aeb..227d0ab 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
@@ -110,8 +110,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -124,24 +124,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
index eba8477..1a3f597 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
@@ -110,8 +110,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -124,24 +124,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-avx.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-avx.c
index 48b7e65..4c04278 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-avx.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-avx.c
@@ -98,8 +98,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -112,24 +112,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c
index 23fd99e..299d609 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c
@@ -146,6 +146,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c
index 9ea9ad4..975d67c 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c
@@ -98,8 +98,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -112,24 +112,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c
index 6a9b744..1b56035 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c
@@ -146,6 +146,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c
index b4d11ab..5565473 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c
@@ -103,8 +103,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -117,24 +117,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-avx-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-avx-ld128.c
index 795a68a..9f3ee44 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-avx-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-avx-ld128.c
@@ -77,8 +77,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -91,24 +91,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-avx-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-avx-ld64.c
index 77d1188..d537eeb 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-avx-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-avx-ld64.c
@@ -79,8 +79,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -93,24 +93,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
index b8afcda..6370331 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
@@ -125,6 +125,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
index 862a357..89adced 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
@@ -127,6 +127,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
index 49213c7..a68701d 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
@@ -77,8 +77,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -91,24 +91,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
index b137d9d..17a890e 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
@@ -79,8 +79,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -93,24 +93,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
index a429fef..33104b1 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
@@ -125,6 +125,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
index f3f96c5..535b406 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
@@ -127,6 +127,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
index 169e070..34e3cb0 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
@@ -82,8 +82,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -96,24 +96,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
index cfbd035..ebcc4aa 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
@@ -84,8 +84,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -98,24 +98,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-avx.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-avx.c
index 29b9b6b..6d4827c 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-avx.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-avx.c
@@ -75,8 +75,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -89,24 +89,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c
index 177f057..c5da5d5 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c
@@ -123,6 +123,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c
index 2ce36cb..c0ee9fd 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c
@@ -75,8 +75,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -89,24 +89,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c
index 57abe7a..9967e0f 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c
@@ -123,6 +123,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c
index 8df261f..44bc8cc 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c
@@ -80,8 +80,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -94,24 +94,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-avx2.c b/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
index d41d196..5f0a77c 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
+++ b/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
@@ -91,10 +91,10 @@
     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
-    const __m256i vacc0x11335577 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
 
     const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
 
@@ -105,25 +105,25 @@
 
     const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     const __m256i vrem0x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
     vacc0x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
 
     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    vacc00x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc00x01234567, voutput_min), voutput_max);
-
     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
 
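The AVX2 kernels additionally get their own params layout: the requantization constants are stored pre-broadcast as 256-bit values, so each _mm256_broadcastsi128_si256(_mm_load_si128(...)) pair collapses into one aligned _mm256_load_si256, and the odd-lane accumulators for the Q31 multiply are now produced with a 64-bit right shift instead of an epi32 shuffle (the multiply only reads the low 32-bit half of each 64-bit element, so either form works). A sketch under those assumptions, with a hypothetical params struct standing in for the real xnn_qs8_gemm_params definition:

#include <immintrin.h>
#include <stdint.h>

struct hypothetical_qs8_avx2_params {
  _Alignas(32) int32_t multiplier[8];  /* one value replicated across all 8 lanes */
};

/* Old style: a 128-bit load plus a lane broadcast per constant
   (mult4 must be 16-byte aligned and hold 4 replicated values). */
static inline __m256i load_sse2_style(const int32_t mult4[4]) {
  return _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) mult4));
}

/* New style: the constant is already 256 bits wide in the params block. */
static inline __m256i load_avx2_style(const struct hypothetical_qs8_avx2_params* p) {
  return _mm256_load_si256((const __m256i*) p->multiplier);
}

/* Both forms below leave accumulator 2k+1 in the even 32-bit lane of 64-bit
   element k, which is all _mm256_mul_epi32 consumes. */
static inline __m256i odd_acc_shuffle(__m256i v) { return _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); }
static inline __m256i odd_acc_shift(__m256i v)   { return _mm256_srli_epi64(v, 32); }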
diff --git a/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c b/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c
index 74a33fe..69b6b7c 100644
--- a/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c
+++ b/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c
@@ -87,10 +87,10 @@
     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
-    const __m256i vacc0x11335577 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
 
     const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
 
@@ -101,25 +101,25 @@
 
     const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     const __m256i vrem0x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
     vacc0x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
 
     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    vacc00x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc00x01234567, voutput_min), voutput_max);
-
     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
 
diff --git a/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c
index 8b23c20..cab5fd0 100644
--- a/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c
@@ -49,14 +49,14 @@
 
   const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
   const __mmask16 vblend_mask = _cvtu32_mask16(0xAAAA);
-  const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-  const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.rounding));
-  const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
-  const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-  const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
-  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-  const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_min));
-  const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_max));
+  const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.multiplier));
+  const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.rounding));
+  const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.remainder_mask));
+  const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.remainder_threshold));
+  const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
+  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
+  const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_min));
+  const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_max));
   do {
     __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
     __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
@@ -135,13 +135,13 @@
     vacc1x084C195D2A6E3B7F =
       _mm512_mask_sub_epi32(vacc1x084C195D2A6E3B7F, _mm512_cmpgt_epi32_mask(vrem1x084C195D2A6E3B7F, vremainder_threshold), vacc1x084C195D2A6E3B7F, vminus_one);
 
-    __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
-
-    vacc01x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc01x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
+    const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
 
     const __m256i vout01x084Cx2A6Ex195Dx3B7F = _mm256_packs_epi16(_mm512_castsi512_si256(vacc01x084Cx195Dx2A6Ex3B7F), _mm512_extracti32x8_epi32(vacc01x084Cx195Dx2A6Ex3B7F, 1));
     const __m256i vout01x084C2A6E195D3B7F = _mm256_permutevar8x32_epi32(vout01x084Cx2A6Ex195Dx3B7F, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
-    const __m256i vout01x0123456789ABCDEF = _mm256_shuffle_epi8(vout01x084C2A6E195D3B7F, _mm256_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0, 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+    __m256i vout01x0123456789ABCDEF = _mm256_shuffle_epi8(vout01x084C2A6E195D3B7F, _mm256_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0, 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+    vout01x0123456789ABCDEF = _mm256_max_epi8(vout01x0123456789ABCDEF, voutput_min);
+    vout01x0123456789ABCDEF = _mm256_min_epi8(vout01x0123456789ABCDEF, voutput_max);
 
     if (nc >= 16) {
       _mm_storeu_si128((__m128i*) c0, _mm256_castsi256_si128(vout01x0123456789ABCDEF));
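In the AVX512SKX kernel the requantization constants still come from the 128-bit sse4 layout and are broadcast to 512 bits, but the clamp follows the same pattern as above: output_min/output_max shrink from __m512i to __m256i, and the int16 clamp before the 8-bit pack is replaced by a byte clamp on the final output vector. A minimal sketch of that tail, again with hypothetical qmin/qmax in place of the params fields:

#include <immintrin.h>
#include <stdint.h>

/* vout already holds 32 packed int8 results (2 rows x 16 channels), so one
   256-bit max/min pair clamps both rows at once. */
static inline __m256i clamp_vout_avx2(__m256i vout, int8_t qmin, int8_t qmax) {
  vout = _mm256_max_epi8(vout, _mm256_set1_epi8(qmin));
  return _mm256_min_epi8(vout, _mm256_set1_epi8(qmax));
}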
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-avx-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-avx-ld128.c
index 681db21..a6bb8cf 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-avx-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-avx-ld128.c
@@ -132,8 +132,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -152,28 +152,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
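The store path in the SSE4.1/AVX/XOP 2-row kernels is unchanged by this diff, but it explains why a single clamped vout suffices: after the pack, bytes 0-3 of vout are row 0 and bytes 4-7 are row 1, so row 0 is stored with _mm_cvtsi128_si32 and row 1 with _mm_extract_epi32 (SSE4.1 PEXTRD), while the SSE2/SSSE3 variants below instead shift vout right by 4 bytes between two cvtsi stores. A hypothetical helper showing that layout assumption:

#include <smmintrin.h>
#include <stdint.h>
#include <string.h>

/* vout byte layout after the pack: [r0c0 r0c1 r0c2 r0c3 r1c0 r1c1 r1c2 r1c3 ...]. */
static inline void store_two_rows_of_four(int8_t* c0, int8_t* c1, __m128i vout) {
  const uint32_t row0 = (uint32_t) _mm_cvtsi128_si32(vout);     /* 32-bit lane 0 */
  const uint32_t row1 = (uint32_t) _mm_extract_epi32(vout, 1);  /* 32-bit lane 1 */
  memcpy(c0, &row0, sizeof(row0));
  memcpy(c1, &row1, sizeof(row1));
}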
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-avx-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-avx-ld64.c
index 7c738c1..2b72822 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-avx-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-avx-ld64.c
@@ -132,8 +132,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -152,28 +152,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-sse2-ld128.c
index c6b83cc..c4ade3b 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-sse2-ld128.c
@@ -198,6 +198,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-sse2-ld64.c
index 8abfa22..672ce4a 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-sse2-ld64.c
@@ -198,6 +198,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
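The SSE2 and SSSE3 kernels keep the previous epilogue because byte min/max (PMAXSB/PMINSB) only exists from SSE4.1 on, so they stay on the sse2 params layout and clamp in int16 before packing; their hunks in this diff merely pick up an extra blank line from regeneration. For contrast with the SSE4 sketch above, the retained epilogue looks roughly like this, once more with hypothetical scalar bounds in place of the params fields:

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Clamp the zero-point-adjusted int16 accumulators, then saturate-pack to int8. */
static inline __m128i qs8_epilogue_sse2(__m128i vacc16, int8_t qmin, int8_t qmax) {
  const __m128i vmin = _mm_set1_epi16((int16_t) qmin);
  const __m128i vmax = _mm_set1_epi16((int16_t) qmax);
  vacc16 = _mm_min_epi16(_mm_max_epi16(vacc16, vmin), vmax);
  return _mm_packs_epi16(vacc16, vacc16);
}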
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-sse41-ld128.c
index 3decb45..60a7aa1 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-sse41-ld128.c
@@ -132,8 +132,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -152,28 +152,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-sse41-ld64.c
index dbadda0..7f5ba39 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-sse41-ld64.c
@@ -132,8 +132,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -152,28 +152,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-ssse3-ld128.c
index 8250c5e..53bf77c 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-ssse3-ld128.c
@@ -198,6 +198,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-ssse3-ld64.c
index e645259..c2aec7c 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-ssse3-ld64.c
@@ -198,6 +198,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-xop-ld128.c
index c5e92c0..6266b33 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-xop-ld128.c
@@ -137,8 +137,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -157,28 +157,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-xop-ld64.c
index 37ebc2c..a4f12a4 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-xop-ld64.c
@@ -137,8 +137,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -157,28 +157,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-avx.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-avx.c
index fff7970..dad6b55 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-avx.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-avx.c
@@ -125,8 +125,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -145,28 +145,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-sse2.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-sse2.c
index f4283b2..4beb018 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-sse2.c
@@ -191,6 +191,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-sse41.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-sse41.c
index 3d3015d..a1575e9 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-sse41.c
@@ -125,8 +125,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -145,28 +145,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-ssse3.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-ssse3.c
index cd0e181..d51e490 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-ssse3.c
@@ -191,6 +191,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-xop.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-xop.c
index 5138b63..19dcda2 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-xop.c
@@ -130,8 +130,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -150,28 +150,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-avx-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-avx-ld128.c
index 2fc78af..fc48e65 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-avx-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-avx-ld128.c
@@ -97,8 +97,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -117,28 +117,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-avx-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-avx-ld64.c
index 60896cc..713a705 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-avx-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-avx-ld64.c
@@ -99,8 +99,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -119,28 +119,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
index fdb93c7..978e912 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
@@ -163,6 +163,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
index d7678a3..8d7725e 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
@@ -165,6 +165,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
index dde7a53..6236921 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
@@ -97,8 +97,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -117,28 +117,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
index e5c3e40..d9db58d 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
@@ -99,8 +99,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -119,28 +119,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
index b319c41..26c49c9 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
@@ -163,6 +163,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
index 4ba4dde..163a9f8 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
@@ -165,6 +165,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
index 4e303c0..8f2ec5e 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
@@ -102,8 +102,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -122,28 +122,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
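The parameter renames in these hunks leave the Q31 requantization arithmetic itself untouched. For reference, a scalar sketch of what the vmultiplier / vrounding / vremainder_mask / vremainder_threshold / vshift sequence computes per 32-bit lane (requantize_q31 is an illustrative helper, not a library function; an arithmetic right shift for negative signed values is assumed, as on the compilers these kernels target):

  #include <stdint.h>

  // Q31 fixed-point multiply with rounding (the 0x40000000 addend plays the
  // role of the vrounding table), followed by a rounding arithmetic right
  // shift whose remainder correction rounds ties away from zero.
  static inline int32_t requantize_q31(int32_t acc, int32_t multiplier, uint32_t shift) {
    const int64_t product = (int64_t) acc * (int64_t) multiplier;
    const int32_t q31product =
        (int32_t) (uint32_t) ((uint64_t) (product + INT64_C(0x40000000)) >> 31);
    const int32_t remainder_mask = (int32_t) ((UINT32_C(1) << shift) - UINT32_C(1));
    const int32_t remainder_threshold = remainder_mask >> 1;
    // _mm_cmpgt_epi32(0, vq31prod) contributes -1 in negative lanes; the
    // scalar equivalent subtracts 1 from the masked remainder when negative.
    const int32_t remainder = (q31product & remainder_mask) - (int32_t) (q31product < 0);
    return (q31product >> shift) + (int32_t) (remainder > remainder_threshold);
  }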
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
index c430eaf..16bbdcc 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
@@ -104,8 +104,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -124,28 +124,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-avx.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-avx.c
index 59961d9..3a1a964 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-avx.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-avx.c
@@ -95,8 +95,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -115,28 +115,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c
index 2c1ada6..7764dd8 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c
@@ -161,6 +161,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c
index febf10b..1903f6b 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c
@@ -95,8 +95,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -115,28 +115,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c
index 762b37d..bde1b65 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c
@@ -161,6 +161,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c
index 390558f..b15a8f9 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c
@@ -100,8 +100,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -120,28 +120,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-avx2.c b/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
index 736f0fd..be11da6 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
+++ b/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
@@ -112,11 +112,11 @@
     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
-    const __m256i vacc0x11335577 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m256i vacc1x11335577 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
+    const __m256i vacc1x11335577 = _mm256_srli_epi64(vacc1x01234567, 32);
 
     const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
     const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
@@ -132,29 +132,29 @@
     const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
     const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     const __m256i vrem0x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
     const __m256i vrem1x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
     vacc0x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
     vacc1x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
 
     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
-
     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc01x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
 
diff --git a/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c b/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c
index 71f7c04..f4e707b 100644
--- a/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c
+++ b/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c
@@ -108,11 +108,11 @@
     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
-    const __m256i vacc0x11335577 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m256i vacc1x11335577 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
+    const __m256i vacc1x11335577 = _mm256_srli_epi64(vacc1x01234567, 32);
 
     const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
     const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
@@ -128,29 +128,29 @@
     const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
     const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     const __m256i vrem0x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
     const __m256i vrem1x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
     vacc0x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
     vacc1x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
 
     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
-
     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc01x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
 
diff --git a/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c
index 03a173c..b6d6a6d 100644
--- a/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c
@@ -55,14 +55,14 @@
 
   const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
   const __mmask16 vblend_mask = _cvtu32_mask16(0xAAAA);
-  const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-  const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.rounding));
-  const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
-  const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-  const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
-  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-  const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_min));
-  const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_max));
+  const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.multiplier));
+  const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.rounding));
+  const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.remainder_mask));
+  const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.remainder_threshold));
+  const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
+  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
+  const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_min));
+  const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_max));
   do {
     __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
     __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
@@ -165,15 +165,14 @@
     vacc2x084C195D2A6E3B7F =
       _mm512_mask_sub_epi32(vacc2x084C195D2A6E3B7F, _mm512_cmpgt_epi32_mask(vrem2x084C195D2A6E3B7F, vremainder_threshold), vacc2x084C195D2A6E3B7F, vminus_one);
 
-    __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
-    __m512i vacc22x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc2x084C195D2A6E3B7F), voutput_zero_point);
-
-    vacc01x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc01x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
-    vacc22x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc22x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
+    const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
+    const __m512i vacc22x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc2x084C195D2A6E3B7F), voutput_zero_point);
 
     __m512i vout0122x084Cx195Dx2A6Ex3B7F = _mm512_packs_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc22x084Cx195Dx2A6Ex3B7F);
     vout0122x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout0122x084Cx195Dx2A6Ex3B7F);
-    const __m512i vout0122x0123456789ABCDEF = _mm512_shuffle_epi8(vout0122x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+    __m512i vout0122x0123456789ABCDEF = _mm512_shuffle_epi8(vout0122x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+    vout0122x0123456789ABCDEF = _mm512_max_epi8(vout0122x0123456789ABCDEF, voutput_min);
+    vout0122x0123456789ABCDEF = _mm512_min_epi8(vout0122x0123456789ABCDEF, voutput_max);
 
     if (nc >= 16) {
       _mm_storeu_si128((__m128i*) c0, _mm512_castsi512_si128(vout0122x0123456789ABCDEF));
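The AVX512 SKX kernel keeps broadcasting 128-bit tables with _mm512_broadcast_i32x4, now from the sse4 member, and moves the clamp after the byte shuffle so a single epi8 min/max pair replaces the two epi16 clamps. A sketch of the reordered tail, where vpermute_index and vshuffle_index stand in for the _mm512_set_epi32 / _mm512_set_epi8 constants used in the kernel, voutput_min / voutput_max are the broadcasts of params->sse4.output_min / output_max shown above, and the helper name is illustrative:

  #include <immintrin.h>  // AVX-512F + AVX-512BW (Skylake-X)

  // The saturating pack to int16 (with the zero point added) happened earlier;
  // here: pack to int8, restore column order, then clamp the bytes once.
  static inline __m512i qs8_output_clamp_avx512(
      __m512i vacc_lo, __m512i vacc_hi,
      __m512i vpermute_index, __m512i vshuffle_index,
      __m512i voutput_min, __m512i voutput_max)
  {
    __m512i vout = _mm512_packs_epi16(vacc_lo, vacc_hi);
    vout = _mm512_permutexvar_epi32(vpermute_index, vout);
    vout = _mm512_shuffle_epi8(vout, vshuffle_index);
    vout = _mm512_max_epi8(vout, voutput_min);
    vout = _mm512_min_epi8(vout, voutput_max);
    return vout;
  }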
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-avx-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-avx-ld128.c
index 99c9140..06a9192 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-avx-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-avx-ld128.c
@@ -159,8 +159,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -185,7 +185,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -193,8 +193,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -202,17 +202,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
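In the 3x4c2 kernels the shift is now fetched with _mm_loadl_epi64 instead of _mm_load_si128. _mm_sra_epi32 takes its count from the low 64 bits of the second operand, so an 8-byte load is sufficient; presumably the specialized layouts keep the shift in a smaller field, though only the load width is visible in this diff. A minimal sketch (the helper name is illustrative):

  #include <emmintrin.h>  // SSE2

  // The per-lane arithmetic right shift only looks at the low 64 bits of the
  // count register, so loading the shift with _mm_loadl_epi64 is enough.
  static inline __m128i qs8_arithmetic_shift(__m128i vq31prod, const void* shift_field) {
    const __m128i vshift = _mm_loadl_epi64((const __m128i*) shift_field);
    return _mm_sra_epi32(vq31prod, vshift);
  }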
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-avx-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-avx-ld64.c
index db03795..cff57f6 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-avx-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-avx-ld64.c
@@ -159,8 +159,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -185,7 +185,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -193,8 +193,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -202,17 +202,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-sse2-ld128.c
index 026b9e2..a3e6c9f 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-sse2-ld128.c
@@ -226,7 +226,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -245,6 +245,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
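The plain SSE2 and SSSE3 variants cannot take the byte clamp, because _mm_max_epi8 / _mm_min_epi8 are SSE4.1 instructions; apart from the narrower shift load, their hunks only add a blank line (presumably an artifact of the shared kernel template), and the 16-bit clamp ahead of _mm_packs_epi16 stays in place. For contrast with the SSE4.1 sketch earlier, the retained pre-pack clamp looks like this (helper name illustrative; vacc01x0123 / vacc22x0123 already carry the output zero point):

  #include <emmintrin.h>  // SSE2

  // SSE2/SSSE3 output stage: clamp as int16 before the final pack, since byte
  // min/max are unavailable below SSE4.1.
  static inline __m128i qs8_output_stage_sse2(
      __m128i vacc01x0123, __m128i vacc22x0123,
      __m128i voutput_min, __m128i voutput_max)
  {
    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
    return _mm_packs_epi16(vacc01x0123, vacc22x0123);
  }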
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-sse2-ld64.c
index c5d590f..a6a69b4 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-sse2-ld64.c
@@ -226,7 +226,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -245,6 +245,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-sse41-ld128.c
index 79e7aeb..396470b 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-sse41-ld128.c
@@ -159,8 +159,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -185,7 +185,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -193,8 +193,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -202,17 +202,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-sse41-ld64.c
index 63d8d88..227d609 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-sse41-ld64.c
@@ -159,8 +159,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -185,7 +185,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -193,8 +193,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -202,17 +202,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-ssse3-ld128.c
index 2d2cd3e..306b6e5 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-ssse3-ld128.c
@@ -226,7 +226,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -245,6 +245,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-ssse3-ld64.c
index 171dd4e..664fd04 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-ssse3-ld64.c
@@ -226,7 +226,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -245,6 +245,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-xop-ld128.c
index 3ee5466..f599282 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-xop-ld128.c
@@ -164,8 +164,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -190,7 +190,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -198,8 +198,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -207,17 +207,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-xop-ld64.c
index 1281aa0..c7a4ceb 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-xop-ld64.c
@@ -164,8 +164,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -190,7 +190,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -198,8 +198,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -207,17 +207,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-avx.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-avx.c
index f9b6311..0f5d9b1 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-avx.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-avx.c
@@ -152,8 +152,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -178,7 +178,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -186,8 +186,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -195,17 +195,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-sse2.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-sse2.c
index 51dd9bf..0b0c916 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-sse2.c
@@ -219,7 +219,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -238,6 +238,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-sse41.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-sse41.c
index b88c263..c4b2953 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-sse41.c
@@ -152,8 +152,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -178,7 +178,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -186,8 +186,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -195,17 +195,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-ssse3.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-ssse3.c
index 2806362..4cd3e0d 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-ssse3.c
@@ -219,7 +219,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -238,6 +238,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-xop.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-xop.c
index c72ebdb..86d4b3f 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-xop.c
@@ -157,8 +157,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -183,7 +183,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -191,8 +191,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -200,17 +200,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
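The SSE4.1/AVX/XOP hunks above all make the same two changes: the output clamp moves after the 16-to-8-bit pack and operates on bytes, and the shift is fetched with a 64-bit load. A minimal standalone sketch of that epilogue (not part of the generated kernels; it assumes the new sse4 params variant stores output_min/output_max as 16 pre-splatted int8 values, as the byte-wise compares above imply):

#include <smmintrin.h>  /* SSE4.1: _mm_max_epi8 / _mm_min_epi8 */

/* Hypothetical helper mirroring the rewritten epilogue: saturate the int16
   accumulators down to int8 first, then clamp the packed bytes against
   pre-splatted int8 bounds. */
static inline __m128i clamp_packed_qs8(__m128i vacc01, __m128i vacc22,
                                       const void* output_min,
                                       const void* output_max)
{
  __m128i vout = _mm_packs_epi16(vacc01, vacc22);
  vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) output_min));
  vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) output_max));
  return vout;
}

/* PSRAD reads its count from the low 64 bits of the operand, so a MOVQ-style
   load of the shift value is sufficient. */
static inline __m128i load_shift_count(const void* shift)
{
  return _mm_loadl_epi64((const __m128i*) shift);
}

For these 3-row tiles the reordering replaces two epi16 min/max pairs with a single epi8 pair, and the narrower shift load avoids reading bytes beyond the 64 bits that _mm_sra_epi32 actually consumes.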
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-avx-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-avx-ld128.c
index abcedde..a2bf9d9 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-avx-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-avx-ld128.c
@@ -117,8 +117,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -143,7 +143,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -151,8 +151,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -160,17 +160,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-avx-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-avx-ld64.c
index a92f659..72b3074 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-avx-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-avx-ld64.c
@@ -119,8 +119,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -145,7 +145,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -153,8 +153,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -162,17 +162,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c
index 9bc9881..0f57e0f 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c
@@ -184,7 +184,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -203,6 +203,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c
index 7db069f..10fa7cd 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c
@@ -186,7 +186,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -205,6 +205,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
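The SSE2 and SSSE3 kernels, by contrast, only pick up the narrower shift load in these hunks: signed byte min/max (_mm_max_epi8/_mm_min_epi8) does not exist before SSE4.1 (SSE2 only provides the unsigned _mm_max_epu8/_mm_min_epu8), so the 16-bit clamp presumably stays ahead of the pack, outside the context lines shown. For contrast, an illustrative pre-SSE4.1 epilogue:

#include <emmintrin.h>  /* SSE2 */

/* Illustrative only: clamp the int16 accumulators, then saturate to int8. */
static inline __m128i clamp_then_pack_qs8_sse2(__m128i vacc01, __m128i vacc22,
                                               __m128i voutput_min,
                                               __m128i voutput_max)
{
  vacc01 = _mm_min_epi16(_mm_max_epi16(vacc01, voutput_min), voutput_max);
  vacc22 = _mm_min_epi16(_mm_max_epi16(vacc22, voutput_min), voutput_max);
  return _mm_packs_epi16(vacc01, vacc22);
}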
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c
index 5d1bf8f..6d8579c 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c
@@ -117,8 +117,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -143,7 +143,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -151,8 +151,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -160,17 +160,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c
index 7866238..62b0ba2 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c
@@ -119,8 +119,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -145,7 +145,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -153,8 +153,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -162,17 +162,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c
index 689ea6f..81d0de0 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c
@@ -184,7 +184,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -203,6 +203,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c
index b545041..fa0121b 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c
@@ -186,7 +186,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -205,6 +205,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c
index f1dd230..4a09afe 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c
@@ -122,8 +122,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -148,7 +148,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -156,8 +156,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -165,17 +165,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c
index 9f91256..0c874fc 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c
@@ -124,8 +124,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -150,7 +150,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -158,8 +158,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -167,17 +167,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-avx.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-avx.c
index 8238c2d..a4192f6 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-avx.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-avx.c
@@ -115,8 +115,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -141,7 +141,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -149,8 +149,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -158,17 +158,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c
index 51d77de..4d4d116 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c
@@ -182,7 +182,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -201,6 +201,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c
index ac20132..a00a4ca 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c
@@ -115,8 +115,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -141,7 +141,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -149,8 +149,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -158,17 +158,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c
index 80eae05..13681e3 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c
@@ -182,7 +182,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -201,6 +201,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c
index c8fa729..e471b8e 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c
@@ -120,8 +120,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -146,7 +146,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -154,8 +154,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -163,17 +163,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
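The AVX2 hunks below apply the same idea at 256-bit width and add two further changes: the avx2 params variant appears to store every constant already broadcast to 256 bits, so a single aligned _mm256_load_si256 replaces each _mm256_broadcastsi128_si256(_mm_load_si128(...)) pair, and the odd accumulator lanes are produced with a 64-bit logical shift instead of a shuffle. A rough sketch with hypothetical helper names:

#include <immintrin.h>  /* AVX2 */

/* Sketch of the new AVX2 epilogue, assuming output_min/output_max are stored
   as 32 pre-splatted int8 values each. */
static inline __m256i clamp_packed_qs8_avx2(__m256i vacc01, __m256i vacc22,
                                            const void* output_min,
                                            const void* output_max)
{
  __m256i vout = _mm256_packs_epi16(vacc01, vacc22);
  vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) output_min));
  vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) output_max));
  return vout;
}

/* _mm256_mul_epi32 only reads the low 32 bits of each 64-bit lane, so
   shifting each lane right by 32 puts the odd 32-bit accumulators where the
   multiply will pick them up, replacing the earlier shuffle_epi32. */
static inline __m256i odd_lanes_for_mul(__m256i vacc)
{
  return _mm256_srli_epi64(vacc, 32);
}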
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-avx2.c b/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
index fd0ce85..d41e047 100644
--- a/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
+++ b/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
@@ -133,12 +133,12 @@
     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
-    const __m256i vacc0x11335577 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m256i vacc1x11335577 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m256i vacc2x11335577 = _mm256_shuffle_epi32(vacc2x01234567, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
+    const __m256i vacc1x11335577 = _mm256_srli_epi64(vacc1x01234567, 32);
+    const __m256i vacc2x11335577 = _mm256_srli_epi64(vacc2x01234567, 32);
 
     const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
     const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
@@ -159,7 +159,7 @@
     const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
     const __m256i vq31prod2x01234567 = _mm256_blend_epi16(vq31prod2x0246, vq31prod2x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     const __m256i vrem0x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
     const __m256i vrem1x01234567 =
@@ -167,8 +167,8 @@
     const __m256i vrem2x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod2x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod2x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
     vacc0x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
     vacc1x01234567 =
@@ -176,19 +176,18 @@
     vacc2x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod2x01234567, vshift), _mm256_cmpgt_epi32(vrem2x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
 
     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
-    vacc22x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc22x01234567, voutput_min), voutput_max);
-
     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
 
diff --git a/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c b/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c
index a84d8c4..d589e3c 100644
--- a/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c
+++ b/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c
@@ -129,12 +129,12 @@
     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
-    const __m256i vacc0x11335577 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m256i vacc1x11335577 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m256i vacc2x11335577 = _mm256_shuffle_epi32(vacc2x01234567, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
+    const __m256i vacc1x11335577 = _mm256_srli_epi64(vacc1x01234567, 32);
+    const __m256i vacc2x11335577 = _mm256_srli_epi64(vacc2x01234567, 32);
 
     const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
     const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
@@ -155,7 +155,7 @@
     const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
     const __m256i vq31prod2x01234567 = _mm256_blend_epi16(vq31prod2x0246, vq31prod2x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     const __m256i vrem0x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
     const __m256i vrem1x01234567 =
@@ -163,8 +163,8 @@
     const __m256i vrem2x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod2x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod2x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
     vacc0x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
     vacc1x01234567 =
@@ -172,19 +172,18 @@
     vacc2x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod2x01234567, vshift), _mm256_cmpgt_epi32(vrem2x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
 
     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
-    vacc22x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc22x01234567, voutput_min), voutput_max);
-
     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
 
diff --git a/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c
index ccc7f19..2a7b6ca 100644
--- a/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c
@@ -61,14 +61,14 @@
 
   const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
   const __mmask16 vblend_mask = _cvtu32_mask16(0xAAAA);
-  const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-  const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.rounding));
-  const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
-  const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-  const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
-  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-  const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_min));
-  const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_max));
+  const __m512i vmultiplier = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.multiplier));
+  const __m512i vrounding = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.rounding));
+  const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.remainder_mask));
+  const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.remainder_threshold));
+  const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
+  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
+  const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_min));
+  const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_max));
   do {
     __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
     __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
@@ -195,15 +195,14 @@
     vacc3x084C195D2A6E3B7F =
       _mm512_mask_sub_epi32(vacc3x084C195D2A6E3B7F, _mm512_cmpgt_epi32_mask(vrem3x084C195D2A6E3B7F, vremainder_threshold), vacc3x084C195D2A6E3B7F, vminus_one);
 
-    __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
-    __m512i vacc23x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc3x084C195D2A6E3B7F), voutput_zero_point);
-
-    vacc01x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc01x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
-    vacc23x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc23x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
+    const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
+    const __m512i vacc23x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc3x084C195D2A6E3B7F), voutput_zero_point);
 
     __m512i vout0123x084Cx195Dx2A6Ex3B7F = _mm512_packs_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc23x084Cx195Dx2A6Ex3B7F);
     vout0123x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout0123x084Cx195Dx2A6Ex3B7F);
-    const __m512i vout0123x0123456789ABCDEF = _mm512_shuffle_epi8(vout0123x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+    __m512i vout0123x0123456789ABCDEF = _mm512_shuffle_epi8(vout0123x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+    vout0123x0123456789ABCDEF = _mm512_max_epi8(vout0123x0123456789ABCDEF, voutput_min);
+    vout0123x0123456789ABCDEF = _mm512_min_epi8(vout0123x0123456789ABCDEF, voutput_max);
 
     if (nc >= 16) {
       _mm_storeu_si128((__m128i*) c0, _mm512_castsi512_si128(vout0123x0123456789ABCDEF));
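Across these kernels the clamp moves from 16-bit lanes before the int8 pack (the removed _mm512_min_epi16/_mm512_max_epi16 lines) to byte lanes after it (_mm512_max_epi8/_mm512_min_epi8 on the final shuffled vector). The reorder preserves results because _mm512_packs_epi16 saturates to [-128, 127] and output_min/output_max are themselves int8 quantities, so clamping after saturation yields the same byte as clamping before it. A standalone scalar check of that identity, not XNNPACK code:

/* Standalone check: clamping before or after int8 saturation gives the same
 * result whenever the bounds are int8 values with qmin <= qmax. */
#include <assert.h>
#include <stdint.h>

static int8_t sat8(int32_t x) {
  return (int8_t) (x < -128 ? -128 : x > 127 ? 127 : x);
}
static int32_t clamp(int32_t x, int32_t lo, int32_t hi) {
  return x < lo ? lo : x > hi ? hi : x;
}

int main(void) {
  const int8_t qmin = -100, qmax = 115;  /* arbitrary int8 bounds */
  for (int32_t x = -40000; x <= 40000; x++) {
    const int8_t before = sat8(clamp(x, qmin, qmax));          /* old order: clamp epi16, then pack */
    const int8_t after  = (int8_t) clamp(sat8(x), qmin, qmax); /* new order: pack, then clamp epi8 */
    assert(before == after);
  }
  return 0;
}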
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-avx-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-avx-ld128.c
index 69f3237..d3c0a28 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-avx-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-avx-ld128.c
@@ -186,8 +186,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -218,7 +218,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -228,8 +228,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -239,17 +239,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
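For reference, the fixed-point epilogue these 4x4c2 kernels implement can be modeled by the scalar routine below: a Q31 multiply rounded with 0x40000000, a rounding arithmetic right shift expressed through the remainder mask and threshold, the zero-point addition, and the final clamp. It is illustrative only; parameter packing differs from the real params structs, and the threshold is written out as remainder_mask >> 1, which is consistent with how the remainder comparison behaves here. The switch from _mm_load_si128 to _mm_loadl_epi64 for the shift works because _mm_sra_epi32 consumes only the low 64 bits of its count operand.

/* Scalar model of the requantization done by the vector epilogue above
 * (illustrative only; assumes >> on negative int32 is arithmetic, as
 * _mm_sra_epi32 is). */
#include <stdint.h>

static int8_t requantize_q31(int32_t acc, int32_t multiplier, uint32_t shift,
                             int16_t output_zero_point, int8_t qmin, int8_t qmax) {
  /* vq31prod: Q31 multiply, rounded to nearest via the 0x40000000 constant. */
  const int64_t prod = (int64_t) acc * (int64_t) multiplier;
  const int32_t q31prod = (int32_t) ((prod + INT64_C(0x40000000)) >> 31);

  /* Rounding arithmetic right shift: the -1 for negative inputs and the
   * comparison against mask >> 1 mirror the _mm_cmpgt_epi32 corrections. */
  const int32_t mask = (int32_t) ((UINT32_C(1) << shift) - 1);
  const int32_t rem = (q31prod & mask) - (int32_t) (q31prod < 0);
  int32_t out = (q31prod >> shift) + (int32_t) (rem > (mask >> 1));

  /* Zero point and clamp to the quantized output range. */
  out += (int32_t) output_zero_point;
  if (out < (int32_t) qmin) out = (int32_t) qmin;
  if (out > (int32_t) qmax) out = (int32_t) qmax;
  return (int8_t) out;
}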
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-avx-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-avx-ld64.c
index 48cec4a..43fdc24 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-avx-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-avx-ld64.c
@@ -186,8 +186,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -218,7 +218,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -228,8 +228,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -239,17 +239,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
index b11f3b5..20a69e6 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
@@ -269,7 +269,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -290,6 +290,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
index 57d930d..aed4404 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
@@ -269,7 +269,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -290,6 +290,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
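In the SSE2 and SSSE3 kernels only the shift load changes (narrowed to _mm_loadl_epi64); the clamp stays on 16-bit lanes before the pack because packed signed-byte min/max (_mm_max_epi8/_mm_min_epi8) only exist from SSE4.1 onward. For illustration only, a signed byte clamp on plain SSE2 would need a bias into unsigned range, roughly as below; this is not something the patch adds, it just shows why the pre-pack clamp is kept on the older ISAs.

/* Illustrative only: emulating a signed int8 clamp with SSE2's unsigned byte
 * min/max by biasing into unsigned range. Not used by this change. */
#include <emmintrin.h>

static __m128i clamp_epi8_sse2(__m128i x, __m128i vmin, __m128i vmax) {
  const __m128i vbias = _mm_set1_epi8((char) 0x80);
  const __m128i xu   = _mm_xor_si128(x, vbias);
  const __m128i minu = _mm_xor_si128(vmin, vbias);
  const __m128i maxu = _mm_xor_si128(vmax, vbias);
  const __m128i yu   = _mm_min_epu8(_mm_max_epu8(xu, minu), maxu);
  return _mm_xor_si128(yu, vbias);
}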
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
index f45eed0..31a3cda 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
@@ -186,8 +186,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -218,7 +218,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -228,8 +228,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -239,17 +239,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
index 14330a5..b2cf9d6 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
@@ -186,8 +186,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -218,7 +218,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -228,8 +228,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -239,17 +239,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
index 462829e..c1a756e 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
@@ -269,7 +269,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -290,6 +290,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
index 2789fe5..d1b058d 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
@@ -269,7 +269,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -290,6 +290,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
index 63f3f41..b1b2b86 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
@@ -191,8 +191,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -223,7 +223,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -233,8 +233,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -244,17 +244,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
index 7c30ef9..1bf5524 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
@@ -191,8 +191,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -223,7 +223,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -233,8 +233,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -244,17 +244,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-avx.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-avx.c
index ae7c5a9..8179aeb 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-avx.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-avx.c
@@ -179,8 +179,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -211,7 +211,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -221,8 +221,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -232,17 +232,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c
index 9f90a03..60593d2 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c
@@ -262,7 +262,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -283,6 +283,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c
index 383b1a5..ca5a399 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c
@@ -179,8 +179,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -211,7 +211,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -221,8 +221,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -232,17 +232,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c
index bdb5950..a2cd7d0 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c
@@ -262,7 +262,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -283,6 +283,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       vout = _mm_srli_si128(vout, 4);
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c
index c80140a..699237d 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c
@@ -184,8 +184,8 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -216,7 +216,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -226,8 +226,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -237,17 +237,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
diff --git a/src/qs8-igemm/MRx16c8-avx512skx.c.in b/src/qs8-igemm/MRx16c8-avx512skx.c.in
index 55595b7..4eddff2 100644
--- a/src/qs8-igemm/MRx16c8-avx512skx.c.in
+++ b/src/qs8-igemm/MRx16c8-avx512skx.c.in
@@ -64,13 +64,18 @@
   const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
   $if MR > 1:
-    const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-    const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
   $else:
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
+  $if MR > 2:
+    const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_min));
+    const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_max));
+  $elif MR == 2:
+    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_min));
+    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_max));
+  $else:
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
   do {
     __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
     $for N in range(4, 16, 4):
@@ -155,26 +160,28 @@
         _mm512_mask_sub_epi32(vacc${M}x084C195D2A6E3B7F, _mm512_cmpgt_epi32_mask(vrem${M}x084C195D2A6E3B7F, vremainder_threshold), vacc${M}x084C195D2A6E3B7F, vminus_one);
 
     $if MR == 1:
-      __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
-      vacc0x084C2A6E195D3B7F = _mm256_min_epi16(_mm256_max_epi16(vacc0x084C2A6E195D3B7F, voutput_min), voutput_max);
+      const __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
     $else:
       $for M in range(0, MR, 2):
-        __m512i vacc${M}${min(M+1, MR-1)}x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc${M}x084C195D2A6E3B7F, vacc${min(M+1, MR-1)}x084C195D2A6E3B7F), voutput_zero_point);
-
-      $for M in range(0, MR, 2):
-        vacc${M}${min(M+1, MR-1)}x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc${M}${min(M+1, MR-1)}x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
+        const __m512i vacc${M}${min(M+1, MR-1)}x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc${M}x084C195D2A6E3B7F, vacc${min(M+1, MR-1)}x084C195D2A6E3B7F), voutput_zero_point);
 
     $if MR > 2:
       __m512i vout012${min(3, MR-1)}x084Cx195Dx2A6Ex3B7F = _mm512_packs_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc2${min(3, MR-1)}x084Cx195Dx2A6Ex3B7F);
       vout012${min(M+3, MR-1)}x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout012${min(3, MR-1)}x084Cx195Dx2A6Ex3B7F);
-      const __m512i vout012${min(3, MR-1)}x0123456789ABCDEF = _mm512_shuffle_epi8(vout012${min(3, MR-1)}x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+      __m512i vout012${min(3, MR-1)}x0123456789ABCDEF = _mm512_shuffle_epi8(vout012${min(3, MR-1)}x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+      vout012${min(3, MR-1)}x0123456789ABCDEF = _mm512_max_epi8(vout012${min(3, MR-1)}x0123456789ABCDEF, voutput_min);
+      vout012${min(3, MR-1)}x0123456789ABCDEF = _mm512_min_epi8(vout012${min(3, MR-1)}x0123456789ABCDEF, voutput_max);
     $elif MR == 2:
       const __m256i vout01x084Cx2A6Ex195Dx3B7F = _mm256_packs_epi16(_mm512_castsi512_si256(vacc01x084Cx195Dx2A6Ex3B7F), _mm512_extracti32x8_epi32(vacc01x084Cx195Dx2A6Ex3B7F, 1));
       const __m256i vout01x084C2A6E195D3B7F = _mm256_permutevar8x32_epi32(vout01x084Cx2A6Ex195Dx3B7F, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
-      const __m256i vout01x0123456789ABCDEF = _mm256_shuffle_epi8(vout01x084C2A6E195D3B7F, _mm256_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0, 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+      __m256i vout01x0123456789ABCDEF = _mm256_shuffle_epi8(vout01x084C2A6E195D3B7F, _mm256_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0, 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+      vout01x0123456789ABCDEF = _mm256_max_epi8(vout01x0123456789ABCDEF, voutput_min);
+      vout01x0123456789ABCDEF = _mm256_min_epi8(vout01x0123456789ABCDEF, voutput_max);
     $elif MR == 1:
       const __m128i vout0x084C2A6E195D3B7F = _mm_packs_epi16(_mm256_castsi256_si128(vacc0x084C2A6E195D3B7F), _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
-      const __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+      __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+      vout0x0123456789ABCDEF = _mm_max_epi8(vout0x0123456789ABCDEF, voutput_min);
+      vout0x0123456789ABCDEF = _mm_min_epi8(vout0x0123456789ABCDEF, voutput_max);
 
     $if MR > 2:
       if (nc >= 16) {
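Because the clamp now runs on the final byte-packed vector, the AVX512 template sizes voutput_min/voutput_max to that vector's width: a 512-bit broadcast for MR > 2, a 256-bit broadcast for MR == 2, and a plain 128-bit load for MR == 1. As a sketch (not a verbatim generated file), the MR == 1 tail expands to roughly the following; it is written as a self-contained helper, taking the zero-point-adjusted 16-bit accumulator and the byte-lane bounds as arguments, and it assumes the bounds are stored 16-byte aligned.

/* Sketch of the MR == 1 clamp tail the template above generates. */
#include <immintrin.h>
#include <stdint.h>

static __m128i qs8_avx512_mr1_tail(__m256i vacc0x084C2A6E195D3B7F,
                                   const int8_t* output_min,   /* e.g. params->sse4.output_min */
                                   const int8_t* output_max) { /* e.g. params->sse4.output_max */
  const __m128i voutput_min = _mm_load_si128((const __m128i*) output_min);
  const __m128i voutput_max = _mm_load_si128((const __m128i*) output_max);

  /* Pack 16 x int16 to 16 x int8, undo the 084C... lane order, then clamp bytes. */
  const __m128i vout0x084C2A6E195D3B7F = _mm_packs_epi16(
      _mm256_castsi256_si128(vacc0x084C2A6E195D3B7F),
      _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
  __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F,
      _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
  vout0x0123456789ABCDEF = _mm_max_epi8(vout0x0123456789ABCDEF, voutput_min);
  vout0x0123456789ABCDEF = _mm_min_epi8(vout0x0123456789ABCDEF, voutput_max);
  return vout0x0123456789ABCDEF;
}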
diff --git a/src/qs8-igemm/MRx4c2-sse.c.in b/src/qs8-igemm/MRx4c2-sse.c.in
index e3cc0df..ded0cde 100644
--- a/src/qs8-igemm/MRx4c2-sse.c.in
+++ b/src/qs8-igemm/MRx4c2-sse.c.in
@@ -25,6 +25,7 @@
 #include <xnnpack/math.h>
 
 
+$PARAMS_STRUCT = "sse4" if SSE >= 4 else "sse2"
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
 void xnn_qs8_igemm_minmax_ukernel_${MR}x4c2__${ISA}_${VARIANT.lower()}(
     size_t mr,
@@ -192,8 +193,8 @@
       p -= ${MR} * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.rounding);
 
     $if SSE == 4:
       $for M in range(MR):
@@ -255,31 +256,39 @@
       $for M in range(MR):
         const __m128i vq31prod${M}x0123 = _mm_shuffle_epi32(vq31prod${M}x0213, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_mask);
     $for M in range(MR):
       const __m128i vrem${M}x0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod${M}x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod${M}x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_threshold);
+    $if M > 1:
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->${PARAMS_STRUCT}.shift);
+    $else:
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.shift);
     $for M in range(MR):
       vacc${M}x0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod${M}x0123, vshift), _mm_cmpgt_epi32(vrem${M}x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
     $for M in range(0, MR, 2):
       __m128i vacc${M}${min(M+1, MR-1)}x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc${M}x0123, vacc${min(M+1, MR-1)}x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    $for M in range(0, MR, 2):
-      vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
+    $if SSE < 4:
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+      $for M in range(0, MR, 2):
+        vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
 
     $if MR > 2:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
     $else:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
 
+    $if SSE == 4:
+      vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+      vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       $for M in reversed(range(1, MR)):
         $if SSE == 4:
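With $PARAMS_STRUCT selecting between the sse2 and sse4 spaces at template-expansion time, the same qmin/qmax pair ends up stored in two shapes: 16-bit lanes for the pre-pack clamp kept on SSE2/SSSE3, and byte lanes for the post-pack clamp emitted when SSE == 4. The snippet below is a hypothetical, self-contained illustration of those two shapes and their initialization; it is not XNNPACK's actual params union or init routine.

/* Hypothetical illustration, not XNNPACK's params union: the same int8 clamp
 * bounds in the two layouts the generated kernels read. */
#include <stdint.h>

typedef struct {
  int16_t output_min[8];   /* 16-bit lanes: _mm_max_epi16 before _mm_packs_epi16 */
  int16_t output_max[8];
} clamp_bounds_epi16;       /* shape read via params->sse2.* */

typedef struct {
  int8_t output_min[16];   /* byte lanes: _mm_max_epi8 after _mm_packs_epi16 */
  int8_t output_max[16];
} clamp_bounds_epi8;        /* shape read via params->sse4.* */

static void init_clamp_bounds(int8_t qmin, int8_t qmax,
                              clamp_bounds_epi16* b16, clamp_bounds_epi8* b8) {
  for (int i = 0; i < 8; i++)  { b16->output_min[i] = qmin; b16->output_max[i] = qmax; }
  for (int i = 0; i < 16; i++) { b8->output_min[i]  = qmin; b8->output_max[i]  = qmax; }
}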
diff --git a/src/qs8-igemm/MRx4c8-sse.c.in b/src/qs8-igemm/MRx4c8-sse.c.in
index 0b4b236..2357c61 100644
--- a/src/qs8-igemm/MRx4c8-sse.c.in
+++ b/src/qs8-igemm/MRx4c8-sse.c.in
@@ -25,6 +25,7 @@
 #include <xnnpack/math.h>
 
 
+$PARAMS_STRUCT = "sse4" if SSE >= 4 else "sse2"
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
 void xnn_qs8_igemm_minmax_ukernel_${MR}x4c8__${ISA}_${VARIANT.lower()}(
     size_t mr,
@@ -150,8 +151,8 @@
       $for M in range(MR):
         __m128i vacc${M}x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc${M}x02, vacc${M}x13), _mm_unpackhi_epi32(vacc${M}x02, vacc${M}x13));
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.rounding);
 
     $if SSE == 4:
       $for M in range(MR):
@@ -213,31 +214,39 @@
       $for M in range(MR):
         const __m128i vq31prod${M}x0123 = _mm_shuffle_epi32(vq31prod${M}x0213, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_mask);
     $for M in range(MR):
       const __m128i vrem${M}x0123 =
         _mm_add_epi32(_mm_and_si128(vq31prod${M}x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod${M}x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.remainder_threshold);
+    $if M > 1:
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->${PARAMS_STRUCT}.shift);
+    $else:
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.shift);
     $for M in range(MR):
       vacc${M}x0123 =
         _mm_sub_epi32(_mm_sra_epi32(vq31prod${M}x0123, vshift), _mm_cmpgt_epi32(vrem${M}x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
     $for M in range(0, MR, 2):
       __m128i vacc${M}${min(M+1, MR-1)}x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc${M}x0123, vacc${min(M+1, MR-1)}x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    $for M in range(0, MR, 2):
-      vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
+    $if SSE < 4:
+      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+      $for M in range(0, MR, 2):
+        vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
 
     $if MR > 2:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
     $else:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
 
+    $if SSE == 4:
+      vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+      vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       $for M in reversed(range(1, MR)):
         $if SSE == 4:
diff --git a/src/qs8-igemm/MRx8c8-avx2.c.in b/src/qs8-igemm/MRx8c8-avx2.c.in
index 582f70c..34df8bb 100644
--- a/src/qs8-igemm/MRx8c8-avx2.c.in
+++ b/src/qs8-igemm/MRx8c8-avx2.c.in
@@ -111,8 +111,8 @@
     $for M in range(MR):
       __m256i vacc${M}x01234567 = _mm256_permutevar8x32_epi32(vacc${M}x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
     $for M in range(MR):
       const __m256i vacc${M}x11335577 = _mm256_shuffle_epi32(vacc${M}x01234567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -130,33 +130,35 @@
     $for M in range(MR):
       const __m256i vq31prod${M}x01234567 = _mm256_blend_epi16(vq31prod${M}x0246, vq31prod${M}x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     $for M in range(MR):
       const __m256i vrem${M}x01234567 =
         _mm256_add_epi32(_mm256_and_si256(vq31prod${M}x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${M}x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    $if M > 1:
+      const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
+    $else:
+      const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
     $for M in range(MR):
       vacc${M}x01234567 =
         _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${M}x01234567, vshift), _mm256_cmpgt_epi32(vrem${M}x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     $for M in range(0, MR, 2):
       __m256i vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc${M}x01234567, vacc${min(M+1, MR-1)}x01234567), voutput_zero_point);
 
     $for M in range(0, MR, 2):
       vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_permute4x64_epi64(vacc${M}${min(M+1, MR-1)}x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    $for M in range(0, MR, 2):
-      vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc${M}${min(M+1, MR-1)}x01234567, voutput_min), voutput_max);
-
     $if MR > 2:
       __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc${min(2, MR-1)}${min(3, MR-1)}x01234567);
     $else:
       __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc0${min(1, MR-1)}x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
 
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c
index 6e860c2..fd29a93 100644
--- a/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c
@@ -49,9 +49,9 @@
   const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
   const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
-  const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-  const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-  const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+  const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
+  const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
   do {
     __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
     __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
@@ -116,11 +116,12 @@
     vacc0x084C195D2A6E3B7F =
       _mm512_mask_sub_epi32(vacc0x084C195D2A6E3B7F, _mm512_cmpgt_epi32_mask(vrem0x084C195D2A6E3B7F, vremainder_threshold), vacc0x084C195D2A6E3B7F, vminus_one);
 
-    __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
-    vacc0x084C2A6E195D3B7F = _mm256_min_epi16(_mm256_max_epi16(vacc0x084C2A6E195D3B7F, voutput_min), voutput_max);
+    const __m256i vacc0x084C2A6E195D3B7F = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0x084C195D2A6E3B7F), _mm512_extracti32x8_epi32(vacc0x084C195D2A6E3B7F, 1)), voutput_zero_point);
 
     const __m128i vout0x084C2A6E195D3B7F = _mm_packs_epi16(_mm256_castsi256_si128(vacc0x084C2A6E195D3B7F), _mm256_extracti128_si256(vacc0x084C2A6E195D3B7F, 1));
-    const __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+    __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+    vout0x0123456789ABCDEF = _mm_max_epi8(vout0x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = _mm_min_epi8(vout0x0123456789ABCDEF, voutput_max);
 
     if (nc >= 16) {
       _mm_storeu_si128((__m128i*) c0, vout0x0123456789ABCDEF);
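
In the AVX512SKX kernel above, the same reordering also narrows the clamp constants: with the bounds applied after the final `_mm_packs_epi16` and `_mm_shuffle_epi8`, the 1x16 variant keeps `voutput_min`/`voutput_max` as plain 128-bit int8 loads from the `sse4` fields instead of 256-bit int16 vectors (and, further down, the 2x16 variant shrinks from 512-bit to 256-bit bounds). A tiny standalone illustration with made-up values follows; it is not XNNPACK code.

```c
// Standalone illustration (not XNNPACK code) of the narrower clamp in the
// 1x16 AVX512SKX kernel: sixteen int8 outputs and their bounds all fit in
// 128-bit registers once the clamp happens after the pack and shuffle.
// Build with e.g.: gcc -O2 -msse4.1 clamp16_sketch.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Sixteen packed int8 results, as produced by _mm_packs_epi16 followed
  // by _mm_shuffle_epi8 in the kernel. Values are arbitrary.
  const __m128i vout = _mm_setr_epi8(-128, -64, -3, 0, 1, 7, 42, 90,
                                     100, 111, 120, 127, -100, -90, -50, -1);
  // Illustrative bounds; in the kernel these are loaded from params->sse4.
  const __m128i voutput_min = _mm_set1_epi8(-80);
  const __m128i voutput_max = _mm_set1_epi8(80);

  const __m128i vclamped = _mm_min_epi8(_mm_max_epi8(vout, voutput_min), voutput_max);

  int8_t out[16];
  _mm_storeu_si128((__m128i*) out, vclamped);
  for (int i = 0; i < 16; i++) {
    printf("%d%c", out[i], i == 15 ? '\n' : ' ');
  }
  return 0;
}
```
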
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-avx-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-avx-ld128.c
index 2443e03..f309d27 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-avx-ld128.c
@@ -118,8 +118,8 @@
       p -= 1 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -132,24 +132,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-avx-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-avx-ld64.c
index b846272..852be0e 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-avx-ld64.c
@@ -118,8 +118,8 @@
       p -= 1 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -132,24 +132,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c
index 8bcb718..74c720c 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c
@@ -166,6 +166,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c
index 8909f0d..12f9ea0 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c
@@ -166,6 +166,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c
index dc9dbeb..f3a9c3a 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c
@@ -118,8 +118,8 @@
       p -= 1 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -132,24 +132,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c
index cc8372c..4e9e5f9 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c
@@ -118,8 +118,8 @@
       p -= 1 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -132,24 +132,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c
index 5f22e7c..4380fab 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c
@@ -166,6 +166,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c
index 7c923f1..655c345 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c
@@ -166,6 +166,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c
index 37cbccf..95a2775 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c
@@ -123,8 +123,8 @@
       p -= 1 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -137,24 +137,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c
index 81f421d..841e214 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c
@@ -123,8 +123,8 @@
       p -= 1 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -137,24 +137,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-avx-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-avx-ld128.c
index 2d2d988..58d01b6 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-avx-ld128.c
@@ -90,8 +90,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -104,24 +104,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-avx-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-avx-ld64.c
index 9283765..d2dd638 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-avx-ld64.c
@@ -92,8 +92,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -106,24 +106,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
index 1b225a1..1793705 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
@@ -138,6 +138,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
index 0700e32..5d1fdae 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
@@ -140,6 +140,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
index 0ba0ed2..eaffaa6 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
@@ -90,8 +90,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -104,24 +104,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
index 5f96910..ad2344f 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
@@ -92,8 +92,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -106,24 +106,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
index 0c0454f..da9a1c1 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
@@ -138,6 +138,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
index 5de63b6..520ba62 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
@@ -140,6 +140,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
index 1c95c42..b0646d5 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
@@ -95,8 +95,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -109,24 +109,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
index aa6bbd1..7403cf2 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
@@ -97,8 +97,8 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -111,24 +111,24 @@
 
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-avx2.c b/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
index 4372a66..b988ae6 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
+++ b/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
@@ -104,8 +104,8 @@
     const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
     const __m256i vacc0x11335577 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(3, 3, 1, 1));
 
@@ -118,25 +118,25 @@
 
     const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     const __m256i vrem0x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
     vacc0x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
 
     vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    vacc00x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc00x01234567, voutput_min), voutput_max);
-
     __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
 
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c
index af495ea..1a3b682 100644
--- a/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c
@@ -53,9 +53,9 @@
   const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
   const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
-  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-  const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_min));
-  const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_max));
+  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
+  const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_min));
+  const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse4.output_max));
   do {
     __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
     __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
@@ -148,13 +148,13 @@
     vacc1x084C195D2A6E3B7F =
       _mm512_mask_sub_epi32(vacc1x084C195D2A6E3B7F, _mm512_cmpgt_epi32_mask(vrem1x084C195D2A6E3B7F, vremainder_threshold), vacc1x084C195D2A6E3B7F, vminus_one);
 
-    __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
-
-    vacc01x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc01x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
+    const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
 
     const __m256i vout01x084Cx2A6Ex195Dx3B7F = _mm256_packs_epi16(_mm512_castsi512_si256(vacc01x084Cx195Dx2A6Ex3B7F), _mm512_extracti32x8_epi32(vacc01x084Cx195Dx2A6Ex3B7F, 1));
     const __m256i vout01x084C2A6E195D3B7F = _mm256_permutevar8x32_epi32(vout01x084Cx2A6Ex195Dx3B7F, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
-    const __m256i vout01x0123456789ABCDEF = _mm256_shuffle_epi8(vout01x084C2A6E195D3B7F, _mm256_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0, 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+    __m256i vout01x0123456789ABCDEF = _mm256_shuffle_epi8(vout01x084C2A6E195D3B7F, _mm256_set_epi8(15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0, 15, 7, 11, 3, 13, 5, 9, 1, 14, 6, 10, 2, 12, 4, 8, 0));
+    vout01x0123456789ABCDEF = _mm256_max_epi8(vout01x0123456789ABCDEF, voutput_min);
+    vout01x0123456789ABCDEF = _mm256_min_epi8(vout01x0123456789ABCDEF, voutput_max);
 
     if (nc >= 16) {
       _mm_storeu_si128((__m128i*) c1, _mm256_extracti128_si256(vout01x0123456789ABCDEF, 1));
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-avx-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-avx-ld128.c
index 6c2f64a..cc08b92 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-avx-ld128.c
@@ -147,8 +147,8 @@
       p -= 2 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -167,28 +167,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-avx-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-avx-ld64.c
index 92c3ee4..21823a3 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-avx-ld64.c
@@ -147,8 +147,8 @@
       p -= 2 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -167,28 +167,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-sse2-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-sse2-ld128.c
index 2ae8eba..89f3698 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-sse2-ld128.c
@@ -213,6 +213,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-sse2-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-sse2-ld64.c
index c1ccebd..d59bf28 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-sse2-ld64.c
@@ -213,6 +213,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-sse41-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-sse41-ld128.c
index 10a96a7..b852014 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-sse41-ld128.c
@@ -147,8 +147,8 @@
       p -= 2 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -167,28 +167,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-sse41-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-sse41-ld64.c
index b50953e..4532c35 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-sse41-ld64.c
@@ -147,8 +147,8 @@
       p -= 2 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -167,28 +167,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-ssse3-ld128.c
index 0b72444..10fc1e4 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-ssse3-ld128.c
@@ -213,6 +213,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-ssse3-ld64.c
index 29c6aeb..175955b 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-ssse3-ld64.c
@@ -213,6 +213,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-xop-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-xop-ld128.c
index 4cbf968..71b9cab 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-xop-ld128.c
@@ -152,8 +152,8 @@
       p -= 2 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -172,28 +172,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-xop-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-xop-ld64.c
index 2f53629..88649df 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-xop-ld64.c
@@ -152,8 +152,8 @@
       p -= 2 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -172,28 +172,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-avx-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-avx-ld128.c
index 05fed74..9307dc8 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-avx-ld128.c
@@ -112,8 +112,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -132,28 +132,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-avx-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-avx-ld64.c
index 774b901..b92f417 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-avx-ld64.c
@@ -114,8 +114,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -134,28 +134,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
index 6444d15..9304588 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
@@ -178,6 +178,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
index 5f948d5..c6773a8 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
@@ -180,6 +180,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
index daaacf7..0c1f2ab 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
@@ -112,8 +112,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -132,28 +132,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
index 4ad6652..96af574 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
@@ -114,8 +114,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -134,28 +134,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
index 99bb004..95a3f59 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
@@ -178,6 +178,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
index 971df2a..ec11e48 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
@@ -180,6 +180,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
index ed3cc5c..bd3c5a7 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
@@ -117,8 +117,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -137,28 +137,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
index 2032a31..bb0a7c2 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
@@ -119,8 +119,8 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -139,28 +139,28 @@
     const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-avx2.c b/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
index a82458a..b33bfa9 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
+++ b/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
@@ -127,8 +127,8 @@
     __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
     const __m256i vacc0x11335577 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(3, 3, 1, 1));
     const __m256i vacc1x11335577 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -147,29 +147,29 @@
     const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
     const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     const __m256i vrem0x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
     const __m256i vrem1x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->avx2.shift);
     vacc0x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
     vacc1x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
 
     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
-
     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc01x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
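Note: the AVX2 kernel above additionally switches to a params->avx2 layout whose fields are, per this change, full 32-byte replicas, so a single aligned _mm256_load_si256 replaces the _mm256_broadcastsi128_si256(_mm_load_si128(...)) pair; only the shift count remains a 128-bit load because _mm256_sra_epi32 takes its count as an __m128i. A sketch of the two load styles, with hypothetical field widths:

    #include <immintrin.h>
    #include <stdint.h>

    /* Old: load a 16-byte field and broadcast it to both 128-bit lanes. */
    static inline __m256i load_via_broadcast(const int32_t multiplier4[4]) {
      return _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) multiplier4));
    }

    /* New: the avx2 params field is pre-replicated to 32 bytes,
       so one aligned 256-bit load is enough. */
    static inline __m256i load_direct(const int32_t multiplier8[8]) {
      return _mm256_load_si256((const __m256i*) multiplier8);
    }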
 
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c
index 722d4f3..73d32f0 100644
--- a/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c
@@ -57,9 +57,9 @@
   const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
   const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
-  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-  const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_min));
-  const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_max));
+  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
+  const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_min));
+  const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_max));
   do {
     __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
     __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
@@ -180,15 +180,14 @@
     vacc2x084C195D2A6E3B7F =
       _mm512_mask_sub_epi32(vacc2x084C195D2A6E3B7F, _mm512_cmpgt_epi32_mask(vrem2x084C195D2A6E3B7F, vremainder_threshold), vacc2x084C195D2A6E3B7F, vminus_one);
 
-    __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
-    __m512i vacc22x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc2x084C195D2A6E3B7F), voutput_zero_point);
-
-    vacc01x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc01x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
-    vacc22x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc22x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
+    const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
+    const __m512i vacc22x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc2x084C195D2A6E3B7F), voutput_zero_point);
 
     __m512i vout0122x084Cx195Dx2A6Ex3B7F = _mm512_packs_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc22x084Cx195Dx2A6Ex3B7F);
     vout0122x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout0122x084Cx195Dx2A6Ex3B7F);
-    const __m512i vout0122x0123456789ABCDEF = _mm512_shuffle_epi8(vout0122x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+    __m512i vout0122x0123456789ABCDEF = _mm512_shuffle_epi8(vout0122x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+    vout0122x0123456789ABCDEF = _mm512_max_epi8(vout0122x0123456789ABCDEF, voutput_min);
+    vout0122x0123456789ABCDEF = _mm512_min_epi8(vout0122x0123456789ABCDEF, voutput_max);
 
     if (nc >= 16) {
       _mm_storeu_si128((__m128i*) c2, _mm512_extracti32x4_epi32(vout0122x0123456789ABCDEF, 2));
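Note: in the AVX512SKX kernel the remainder_mask/remainder_threshold/shift loads stay on the sse2 layout while output_zero_point/output_min/output_max switch to sse4, and the clamp again moves past the pack: one _mm512_max_epi8/_mm512_min_epi8 pair (AVX512BW) on the packed byte vector replaces the per-vector int16 clamps on vacc01 and vacc22. A sketch of the reordered epilogue with hypothetical inputs:

    #include <immintrin.h>

    /* Pack two zero-point-adjusted int16 vectors to int8, then clamp once.
       voutput_min/voutput_max are assumed to hold the bounds replicated as bytes. */
    static inline __m512i pack_then_clamp_512(__m512i vacc01, __m512i vacc22,
                                              __m512i voutput_min, __m512i voutput_max) {
      __m512i vout = _mm512_packs_epi16(vacc01, vacc22);
      vout = _mm512_max_epi8(vout, voutput_min);
      return _mm512_min_epi8(vout, voutput_max);
    }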
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-avx-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-avx-ld128.c
index 94a8a7c..07170b4 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-avx-ld128.c
@@ -176,8 +176,8 @@
       p -= 3 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -202,7 +202,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -210,8 +210,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -219,17 +219,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-avx-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-avx-ld64.c
index 2f1a22d..a6224db 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-avx-ld64.c
@@ -176,8 +176,8 @@
       p -= 3 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -202,7 +202,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -210,8 +210,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -219,17 +219,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-sse2-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-sse2-ld128.c
index 5933dd2..991e596 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-sse2-ld128.c
@@ -243,7 +243,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -262,6 +262,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
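Note: in the 3-row kernels the shift-count load also changes from _mm_load_si128 to _mm_loadl_epi64, here even in the plain SSE2 variant. _mm_sra_epi32 reads its shift count from the low 64 bits of its second operand, so an 8-byte load of the shift field is sufficient. A sketch, assuming a hypothetical 8-byte shift storage:

    #include <emmintrin.h>
    #include <stdint.h>

    /* _mm_sra_epi32 only consults bits 0..63 of the count register,
       so loading 8 bytes of the shift field is equivalent to loading 16. */
    static inline __m128i shift_right_arith(__m128i v, const uint64_t* shift) {
      const __m128i vshift = _mm_loadl_epi64((const __m128i*) shift);
      return _mm_sra_epi32(v, vshift);
    }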
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-sse2-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-sse2-ld64.c
index 0e380ee..1dff2c6 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-sse2-ld64.c
@@ -243,7 +243,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -262,6 +262,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-sse41-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-sse41-ld128.c
index b441fae..e9ac047 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-sse41-ld128.c
@@ -176,8 +176,8 @@
       p -= 3 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -202,7 +202,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -210,8 +210,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -219,17 +219,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-sse41-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-sse41-ld64.c
index 6223c65..cf64636 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-sse41-ld64.c
@@ -176,8 +176,8 @@
       p -= 3 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -202,7 +202,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -210,8 +210,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -219,17 +219,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-ssse3-ld128.c
index 0b2a7bc..fedb5a0 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-ssse3-ld128.c
@@ -243,7 +243,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -262,6 +262,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-ssse3-ld64.c
index ab0c741..0a105fb 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-ssse3-ld64.c
@@ -243,7 +243,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -262,6 +262,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
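
The plain SSE2/SSSE3 hunks change only the shift load: _mm_sra_epi32 takes its shift count from the low 64 bits of its second operand, so the 8-byte _mm_loadl_epi64 reads everything the shift actually consumes. A standalone sketch, using hypothetical 16-byte storage whose upper half is deliberately garbage just to show it is never read (compile with -msse2):

// shift_load.c -- gcc -O2 -msse2 shift_load.c
#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>  /* SSE2 */

int main(void) {
  /* Hypothetical shift storage: the count lives in the low 64 bits; the upper
     64 bits hold garbage to demonstrate that _mm_sra_epi32 never reads them. */
  _Alignas(16) const uint64_t shift_storage[2] = { 3, UINT64_C(0xDEADBEEFDEADBEEF) };

  const __m128i vshift_full = _mm_load_si128((const __m128i*) shift_storage);
  const __m128i vshift_low  = _mm_loadl_epi64((const __m128i*) shift_storage);

  const __m128i vacc = _mm_setr_epi32(-1000, -7, 64, 123456);

  int32_t full[4], low[4];
  _mm_storeu_si128((__m128i*) full, _mm_sra_epi32(vacc, vshift_full));
  _mm_storeu_si128((__m128i*) low, _mm_sra_epi32(vacc, vshift_low));
  for (int i = 0; i < 4; i++) {
    printf("%8d %8d%s\n", full[i], low[i], full[i] == low[i] ? "" : "  MISMATCH");
  }
  return 0;
}
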
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-xop-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-xop-ld128.c
index 1ffbfee..6fd187b 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-xop-ld128.c
@@ -181,8 +181,8 @@
       p -= 3 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -207,7 +207,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -215,8 +215,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -224,17 +224,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-xop-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-xop-ld64.c
index 30a0e81..c0a46f4 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-xop-ld64.c
@@ -181,8 +181,8 @@
       p -= 3 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -207,7 +207,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -215,8 +215,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -224,17 +224,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-avx-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-avx-ld128.c
index ee09dd0..7d5fff1 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-avx-ld128.c
@@ -134,8 +134,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -160,7 +160,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -168,8 +168,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -177,17 +177,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-avx-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-avx-ld64.c
index dae0701..330eaa0 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-avx-ld64.c
@@ -136,8 +136,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -162,7 +162,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -170,8 +170,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -179,17 +179,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c
index c0d9941..0022e00 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c
@@ -201,7 +201,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -220,6 +220,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c
index aaf3649..83d4334 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c
@@ -203,7 +203,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -222,6 +222,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c
index 92c755b..a39c3ab 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c
@@ -134,8 +134,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -160,7 +160,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -168,8 +168,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -177,17 +177,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c
index 069b8aa..82eec9e 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c
@@ -136,8 +136,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -162,7 +162,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -170,8 +170,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -179,17 +179,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c
index 69adc9b..0a35711 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c
@@ -201,7 +201,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -220,6 +220,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c
index 6afce4a..27e040c 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c
@@ -203,7 +203,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -222,6 +222,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c
index 0ffb9cc..96d42c0 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c
@@ -139,8 +139,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -165,7 +165,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -173,8 +173,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -182,17 +182,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c
index b7d59b6..f311e3c 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c
@@ -141,8 +141,8 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -167,7 +167,7 @@
     const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -175,8 +175,8 @@
     const __m128i vrem2x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -184,17 +184,16 @@
     vacc2x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
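
In the AVX2 kernel that follows, the quantization constants move from 16-byte sse2 parameter blocks (loaded and then broadcast across lanes at run time) to pre-replicated 32-byte avx2 blocks that a single _mm256_load_si256 reads directly. A standalone sketch of the two load patterns, using hypothetical structs rather than the real xnn_qs8_gemm_params layout (compile with -mavx2):

// avx2_params.c -- gcc -O2 -mavx2 avx2_params.c
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

/* Hypothetical parameter blocks; the real union xnn_qs8_gemm_params is laid out differently. */
struct sse2_style { _Alignas(16) int32_t multiplier[4]; };
struct avx2_style { _Alignas(32) int32_t multiplier[8]; };

int main(void) {
  const struct sse2_style p128 = { { 0x40000000, 0x40000000, 0x40000000, 0x40000000 } };
  const struct avx2_style p256 = { { 0x40000000, 0x40000000, 0x40000000, 0x40000000,
                                     0x40000000, 0x40000000, 0x40000000, 0x40000000 } };

  /* Old: 16-byte load, then broadcast the 128-bit lane across the 256-bit register. */
  const __m256i vmul_old =
      _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) p128.multiplier));
  /* New: the constant is replicated once at init time, so one aligned 32-byte load suffices. */
  const __m256i vmul_new = _mm256_load_si256((const __m256i*) p256.multiplier);

  int32_t old32[8], new32[8];
  _mm256_storeu_si256((__m256i*) old32, vmul_old);
  _mm256_storeu_si256((__m256i*) new32, vmul_new);
  for (int i = 0; i < 8; i++) {
    printf("%d: %08x %08x\n", i, (unsigned) old32[i], (unsigned) new32[i]);
  }
  return 0;
}
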
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-avx2.c b/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
index 202bb77..2055125 100644
--- a/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
+++ b/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
@@ -150,8 +150,8 @@
     __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
     __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
 
-    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
-    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->avx2.multiplier);
+    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
 
     const __m256i vacc0x11335577 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(3, 3, 1, 1));
     const __m256i vacc1x11335577 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(3, 3, 1, 1));
@@ -176,7 +176,7 @@
     const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
     const __m256i vq31prod2x01234567 = _mm256_blend_epi16(vq31prod2x0246, vq31prod2x1357, 0xCC);
 
-    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->avx2.remainder_mask);
     const __m256i vrem0x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
     const __m256i vrem1x01234567 =
@@ -184,8 +184,8 @@
     const __m256i vrem2x01234567 =
       _mm256_add_epi32(_mm256_and_si256(vq31prod2x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod2x01234567));
 
-    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->avx2.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->avx2.shift);
     vacc0x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
     vacc1x01234567 =
@@ -193,19 +193,18 @@
     vacc2x01234567 =
       _mm256_sub_epi32(_mm256_sra_epi32(vq31prod2x01234567, vshift), _mm256_cmpgt_epi32(vrem2x01234567, vremainder_threshold));
 
-    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
     __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
     __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
 
     vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
     vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
 
-    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
-    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
-    vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
-    vacc22x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc22x01234567, voutput_min), voutput_max);
-
     __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
+
+    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_min));
+    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->avx2.output_max));
+
     __m128i vout_lo = _mm256_castsi256_si128(vout);
     __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
 
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c
index 4828185..2752f79 100644
--- a/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c
@@ -61,9 +61,9 @@
   const __m512i vremainder_mask = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
   const __m512i vremainder_threshold = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
-  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
-  const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_min));
-  const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse2.output_max));
+  const __m512i voutput_zero_point = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_zero_point));
+  const __m512i voutput_min = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_min));
+  const __m512i voutput_max = _mm512_broadcast_i32x4(_mm_load_si128((const __m128i*) params->sse4.output_max));
   do {
     __m512i vacc0x0123 = _mm512_maskz_expandloadu_epi32(vbias_mask, w);
     __m512i vacc0x4567 = _mm512_maskz_expandloadu_epi32(vbias_mask, (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
@@ -212,15 +212,14 @@
     vacc3x084C195D2A6E3B7F =
       _mm512_mask_sub_epi32(vacc3x084C195D2A6E3B7F, _mm512_cmpgt_epi32_mask(vrem3x084C195D2A6E3B7F, vremainder_threshold), vacc3x084C195D2A6E3B7F, vminus_one);
 
-    __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
-    __m512i vacc23x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc3x084C195D2A6E3B7F), voutput_zero_point);
-
-    vacc01x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc01x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
-    vacc23x084Cx195Dx2A6Ex3B7F = _mm512_min_epi16(_mm512_max_epi16(vacc23x084Cx195Dx2A6Ex3B7F, voutput_min), voutput_max);
+    const __m512i vacc01x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc0x084C195D2A6E3B7F, vacc1x084C195D2A6E3B7F), voutput_zero_point);
+    const __m512i vacc23x084Cx195Dx2A6Ex3B7F = _mm512_adds_epi16(_mm512_packs_epi32(vacc2x084C195D2A6E3B7F, vacc3x084C195D2A6E3B7F), voutput_zero_point);
 
     __m512i vout0123x084Cx195Dx2A6Ex3B7F = _mm512_packs_epi16(vacc01x084Cx195Dx2A6Ex3B7F, vacc23x084Cx195Dx2A6Ex3B7F);
     vout0123x084Cx195Dx2A6Ex3B7F = _mm512_permutexvar_epi32(_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), vout0123x084Cx195Dx2A6Ex3B7F);
-    const __m512i vout0123x0123456789ABCDEF = _mm512_shuffle_epi8(vout0123x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+    __m512i vout0123x0123456789ABCDEF = _mm512_shuffle_epi8(vout0123x084Cx195Dx2A6Ex3B7F, _mm512_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0, 15, 11, 7, 3, 13, 9, 5, 1, 14, 10, 6, 2, 12, 8, 4, 0));
+    vout0123x0123456789ABCDEF = _mm512_max_epi8(vout0123x0123456789ABCDEF, voutput_min);
+    vout0123x0123456789ABCDEF = _mm512_min_epi8(vout0123x0123456789ABCDEF, voutput_max);
 
     if (nc >= 16) {
       _mm_storeu_si128((__m128i*) c3, _mm512_extracti32x4_epi32(vout0123x0123456789ABCDEF, 3));
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-avx-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-avx-ld128.c
index 90e9f37..e341603 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-avx-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-avx-ld128.c
@@ -205,8 +205,8 @@
       p -= 4 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -237,7 +237,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -247,8 +247,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -258,17 +258,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-avx-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-avx-ld64.c
index 64aa971..2d815c9 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-avx-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-avx-ld64.c
@@ -205,8 +205,8 @@
       p -= 4 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -237,7 +237,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -247,8 +247,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -258,17 +258,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c
index 725a7aa..3e86a97 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c
@@ -288,7 +288,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -309,6 +309,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(3, 3, 3, 3)));
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c
index 636f501..bdb9caa 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c
@@ -288,7 +288,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -309,6 +309,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(3, 3, 3, 3)));
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c
index 0d511f1..09e76c0 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c
@@ -205,8 +205,8 @@
       p -= 4 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -237,7 +237,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -247,8 +247,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -258,17 +258,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c
index 894e46a..66c499c 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c
@@ -205,8 +205,8 @@
       p -= 4 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -237,7 +237,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -247,8 +247,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -258,17 +258,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
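For reference, a scalar sketch (not part of the diff) of the requantization that the vector code above implements, using the same multiplier/rounding/shift/remainder quantities that now live in params->sse4. It assumes arithmetic right shifts of signed values, as on the compilers XNNPACK targets:

#include <stdint.h>

static inline int32_t requantize_q31(int32_t acc, int32_t multiplier, uint32_t shift,
                                     int32_t remainder_mask, int32_t remainder_threshold)
{
  // Q31 fixed-point multiply with rounding: (acc * multiplier + 2^30) >> 31.
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  const int32_t q31product = (int32_t) ((product + INT64_C(0x40000000)) >> 31);
  // Rounding division by 2^shift via the remainder_mask / remainder_threshold trick.
  const int32_t remainder = (q31product & remainder_mask) - (int32_t) (q31product < 0);
  return (q31product >> shift) + (int32_t) (remainder > remainder_threshold);
}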
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c
index 2d1aba8..d8559fe 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c
@@ -288,7 +288,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -309,6 +309,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(3, 3, 3, 3)));
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c
index 11ff0c6..2318f10 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c
@@ -288,7 +288,7 @@
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
     const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -309,6 +309,7 @@
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(3, 3, 3, 3)));
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
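The only functional change in the SSSE3 variants above is the shift load: _mm_sra_epi32 reads just the low 64 bits of its count operand, so a 64-bit _mm_loadl_epi64 of the shift suffices. A sketch (not part of the diff), where shift stands in for the params sse2/sse4 shift field, two uint64_t lanes holding the same value:

#include <emmintrin.h>  // SSE2
#include <stdint.h>

static inline __m128i shift_right_arithmetic(__m128i v, const uint64_t shift[2])
{
  const __m128i vshift = _mm_loadl_epi64((const __m128i*) shift);  // low 64 bits only
  return _mm_sra_epi32(v, vshift);  // shift each int32 lane right by shift[0]
}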
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
index 16a0c0a..0ce5964 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
@@ -210,8 +210,8 @@
       p -= 4 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -242,7 +242,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -252,8 +252,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -263,17 +263,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c
index b4b0e7e..b44fa57 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c
@@ -210,8 +210,8 @@
       p -= 4 * sizeof(void*);
     } while (p != 0);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse4.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
 
     const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
     const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -242,7 +242,7 @@
     const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
     const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
 
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse4.remainder_mask);
     const __m128i vrem0x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
     const __m128i vrem1x0123 =
@@ -252,8 +252,8 @@
     const __m128i vrem3x0123 =
       _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
 
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse4.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse4.shift);
     vacc0x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
     vacc1x0123 =
@@ -263,17 +263,16 @@
     vacc3x0123 =
       _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
 
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
     if (nc >= 4) {
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index d396625..137e859 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -224,6 +224,99 @@
     params->sse2.output_max[i] = (int16_t) output_max;
   }
 }
+
+static inline void xnn_init_qs8_gemm_sse4_params(
+  union xnn_qs8_gemm_params params[XNN_MIN_ELEMENTS(1)],
+  float scale,
+  int8_t output_zero_point,
+  int8_t output_min,
+  int8_t output_max)
+{
+  // Compute requantization parameters.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  // Shift is in [0, 31] range.
+  const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+  const uint32_t remainder_threshold = remainder_mask >> 1;
+  params->sse4.multiplier[0] = multiplier;
+  params->sse4.multiplier[1] = multiplier;
+  params->sse4.multiplier[2] = multiplier;
+  params->sse4.multiplier[3] = multiplier;
+  params->sse4.rounding[0] = UINT64_C(0x40000000);
+  params->sse4.rounding[1] = UINT64_C(0x40000000);
+  params->sse4.remainder_mask[0] = (int32_t) remainder_mask;
+  params->sse4.remainder_mask[1] = (int32_t) remainder_mask;
+  params->sse4.remainder_mask[2] = (int32_t) remainder_mask;
+  params->sse4.remainder_mask[3] = (int32_t) remainder_mask;
+  params->sse4.remainder_threshold[0] = (int32_t) remainder_threshold;
+  params->sse4.remainder_threshold[1] = (int32_t) remainder_threshold;
+  params->sse4.remainder_threshold[2] = (int32_t) remainder_threshold;
+  params->sse4.remainder_threshold[3] = (int32_t) remainder_threshold;
+  params->sse4.shift[0] = (uint64_t) (uint32_t) shift;
+  params->sse4.shift[1] = (uint64_t) (uint32_t) shift;
+  for (uint32_t i = 0; i < 8; i++) {
+    params->sse4.output_zero_point[i] = (int16_t) output_zero_point;
+  }
+  for (uint32_t i = 0; i < 16; i++) {
+    params->sse4.output_min[i] = output_min;
+    params->sse4.output_max[i] = output_max;
+  }
+}
+
+static inline void xnn_init_qs8_gemm_avx2_params(
+  union xnn_qs8_gemm_params params[XNN_MIN_ELEMENTS(1)],
+  float scale,
+  int8_t output_zero_point,
+  int8_t output_min,
+  int8_t output_max)
+{
+  // Compute requantization parameters.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  // Shift is in [0, 31] range.
+  const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+  const uint32_t remainder_threshold = remainder_mask >> 1;
+  for (uint32_t i = 0; i < 8; i++) {
+    params->avx2.multiplier[i] = multiplier;
+  }
+  params->avx2.rounding[0] = UINT64_C(0x40000000);
+  params->avx2.rounding[1] = UINT64_C(0x40000000);
+  params->avx2.rounding[2] = UINT64_C(0x40000000);
+  params->avx2.rounding[3] = UINT64_C(0x40000000);
+  for (uint32_t i = 0; i < 8; i++) {
+    params->avx2.remainder_mask[i] = (int32_t) remainder_mask;
+    params->avx2.remainder_threshold[i] = (int32_t) remainder_threshold;
+  }
+  params->avx2.shift[0] = (uint64_t) (uint32_t) shift;
+  params->avx2.shift[1] = (uint64_t) (uint32_t) shift;
+  params->avx2.shift[2] = (uint64_t) (uint32_t) shift;
+  params->avx2.shift[3] = (uint64_t) (uint32_t) shift;
+  for (uint32_t i = 0; i < 16; i++) {
+    params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
+  }
+  for (uint32_t i = 0; i < 32; i++) {
+    params->avx2.output_min[i] = output_min;
+    params->avx2.output_max[i] = output_max;
+  }
+}
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
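Usage-wise, callers pick the init function that matches the microkernel's parameter layout: the SSE4.1/AVX/XOP/AVX512SKX microkernels read the 128-bit sse4 layout, the AVX2 microkernels the 256-bit avx2 layout. A sketch with illustrative placeholder values (scale must stay within the range the asserts enforce; include paths follow the src/xnnpack/ layout above):

#include <xnnpack/params.h>
#include <xnnpack/params-init.h>

union xnn_qs8_gemm_params params;

// For an SSE4.1/AVX/XOP kernel, fill the 128-bit params->sse4 layout:
xnn_init_qs8_gemm_sse4_params(&params, /*scale=*/0.5f,
  /*output_zero_point=*/-1, /*output_min=*/-128, /*output_max=*/127);

// ...or, for an AVX2 kernel, fill the 256-bit params->avx2 layout instead:
xnn_init_qs8_gemm_avx2_params(&params, /*scale=*/0.5f,
  /*output_zero_point=*/-1, /*output_min=*/-128, /*output_max=*/127);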
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 66625c4..43a400b 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -308,6 +308,26 @@
     XNN_ALIGN(16) int16_t output_min[8];
     XNN_ALIGN(16) int16_t output_max[8];
   } sse2;
+  struct {
+    XNN_ALIGN(16) uint32_t multiplier[4];
+    XNN_ALIGN(16) uint64_t rounding[2];
+    XNN_ALIGN(16) int32_t remainder_mask[4];
+    XNN_ALIGN(16) int32_t remainder_threshold[4];
+    XNN_ALIGN(16) uint64_t shift[2];
+    XNN_ALIGN(16) int16_t output_zero_point[8];
+    XNN_ALIGN(16) int8_t output_min[16];
+    XNN_ALIGN(16) int8_t output_max[16];
+  } sse4;
+  struct {
+    XNN_ALIGN(32) uint32_t multiplier[8];
+    XNN_ALIGN(32) uint64_t rounding[4];
+    XNN_ALIGN(32) int32_t remainder_mask[8];
+    XNN_ALIGN(32) int32_t remainder_threshold[8];
+    XNN_ALIGN(32) uint64_t shift[4];
+    XNN_ALIGN(32) int16_t output_zero_point[16];
+    XNN_ALIGN(32) int8_t output_min[32];
+    XNN_ALIGN(32) int8_t output_max[32];
+  } avx2;
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD
   struct {
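The new avx2 struct duplicates each quantity across a full 32-byte-aligned field so AVX2 kernels can use aligned 256-bit loads directly. A sketch (not part of the diff) of reading one such field:

#include <immintrin.h>       // AVX2
#include <xnnpack/params.h>  // union xnn_qs8_gemm_params

static inline __m256i load_qs8_avx2_multiplier(const union xnn_qs8_gemm_params* params)
{
  // Aligned 256-bit load: the XNN_ALIGN(32) fields guarantee 32-byte alignment.
  return _mm256_load_si256((const __m256i*) params->avx2.multiplier);
}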
diff --git a/test/qs8-dwconv-minmax.cc b/test/qs8-dwconv-minmax.cc
index f10c5b7..6deb3e6 100644
--- a/test/qs8-dwconv-minmax.cc
+++ b/test/qs8-dwconv-minmax.cc
@@ -1887,7 +1887,7 @@
       .cr(8)
       .kr(9)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X9__SSE41_MUL16, c_div_8) {
@@ -1897,7 +1897,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -1909,7 +1909,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -1921,7 +1921,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -1932,7 +1932,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -1943,7 +1943,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -1955,7 +1955,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -1967,7 +1967,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -1979,7 +1979,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -1993,7 +1993,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2007,7 +2007,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2020,7 +2020,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2033,7 +2033,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2045,7 +2045,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2059,7 +2059,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2073,7 +2073,7 @@
       .cr(16)
       .kr(9)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X9__SSE41_MUL16, c_div_16) {
@@ -2083,7 +2083,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2095,7 +2095,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2107,7 +2107,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2118,7 +2118,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2129,7 +2129,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2141,7 +2141,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2153,7 +2153,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2165,7 +2165,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2179,7 +2179,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2193,7 +2193,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2206,7 +2206,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2219,7 +2219,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2231,7 +2231,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2245,7 +2245,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2259,7 +2259,7 @@
       .cr(24)
       .kr(9)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X9__SSE41_MUL16, c_div_24) {
@@ -2269,7 +2269,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2281,7 +2281,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2293,7 +2293,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2304,7 +2304,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2315,7 +2315,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2327,7 +2327,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2339,7 +2339,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2351,7 +2351,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2365,7 +2365,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2379,7 +2379,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2392,7 +2392,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2405,7 +2405,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2417,7 +2417,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2431,7 +2431,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2445,7 +2445,7 @@
       .cr(8)
       .kr(9)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X9__AVX_MUL16, c_div_8) {
@@ -2455,7 +2455,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2467,7 +2467,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2479,7 +2479,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2490,7 +2490,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2501,7 +2501,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2513,7 +2513,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2525,7 +2525,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2537,7 +2537,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2551,7 +2551,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2565,7 +2565,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2578,7 +2578,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2591,7 +2591,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2603,7 +2603,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2617,7 +2617,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2631,7 +2631,7 @@
       .cr(16)
       .kr(9)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X9__AVX_MUL16, c_div_16) {
@@ -2641,7 +2641,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2653,7 +2653,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2665,7 +2665,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2676,7 +2676,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2687,7 +2687,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2699,7 +2699,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2711,7 +2711,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2723,7 +2723,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2737,7 +2737,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2751,7 +2751,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2764,7 +2764,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2777,7 +2777,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2789,7 +2789,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2803,7 +2803,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2817,7 +2817,7 @@
       .cr(24)
       .kr(9)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X9__AVX_MUL16, c_div_24) {
@@ -2827,7 +2827,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2839,7 +2839,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2851,7 +2851,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2862,7 +2862,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2873,7 +2873,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2885,7 +2885,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2897,7 +2897,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2909,7 +2909,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2923,7 +2923,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -2937,7 +2937,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2950,7 +2950,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2963,7 +2963,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2975,7 +2975,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -2989,7 +2989,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -3003,7 +3003,7 @@
       .cr(16)
       .kr(9)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X9__AVX2_MUL16, c_div_16) {
@@ -3013,7 +3013,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3025,7 +3025,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3037,7 +3037,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3048,7 +3048,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3059,7 +3059,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3071,7 +3071,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3083,7 +3083,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3095,7 +3095,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3109,7 +3109,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -3123,7 +3123,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3136,7 +3136,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3149,7 +3149,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3161,7 +3161,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3175,7 +3175,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -3189,7 +3189,7 @@
       .cr(32)
       .kr(9)
       .channels(32)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP32X9__AVX2_MUL16, c_div_32) {
@@ -3199,7 +3199,7 @@
         .cr(32)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3211,7 +3211,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3223,7 +3223,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3234,7 +3234,7 @@
         .cr(32)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3245,7 +3245,7 @@
         .cr(32)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3257,7 +3257,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3269,7 +3269,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3281,7 +3281,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3295,7 +3295,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -3309,7 +3309,7 @@
         .channels(32)
         .width(5)
         .output_stride(163)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3322,7 +3322,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3335,7 +3335,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3347,7 +3347,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(592)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -3361,7 +3361,7 @@
           .channels(channels)
           .input_offset(592)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -3375,7 +3375,7 @@
       .cr(8)
       .kr(9)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X9__SSE41_MUL32, c_div_8) {
@@ -3385,7 +3385,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3397,7 +3397,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3409,7 +3409,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3420,7 +3420,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3431,7 +3431,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3443,7 +3443,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3455,7 +3455,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3467,7 +3467,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3481,7 +3481,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -3495,7 +3495,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3508,7 +3508,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3521,7 +3521,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3533,7 +3533,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3547,7 +3547,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -3561,7 +3561,7 @@
       .cr(16)
       .kr(9)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X9__SSE41_MUL32, c_div_16) {
@@ -3571,7 +3571,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3583,7 +3583,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3595,7 +3595,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3606,7 +3606,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3617,7 +3617,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3629,7 +3629,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3641,7 +3641,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3653,7 +3653,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3667,7 +3667,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -3681,7 +3681,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3694,7 +3694,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3707,7 +3707,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3719,7 +3719,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3733,7 +3733,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -3747,7 +3747,7 @@
       .cr(24)
       .kr(9)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X9__SSE41_MUL32, c_div_24) {
@@ -3757,7 +3757,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3769,7 +3769,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3781,7 +3781,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3792,7 +3792,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3803,7 +3803,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3815,7 +3815,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3827,7 +3827,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3839,7 +3839,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3853,7 +3853,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -3867,7 +3867,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3880,7 +3880,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3893,7 +3893,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3905,7 +3905,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3919,7 +3919,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -3933,7 +3933,7 @@
       .cr(8)
       .kr(9)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X9__AVX_MUL32, c_div_8) {
@@ -3943,7 +3943,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3955,7 +3955,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3967,7 +3967,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3978,7 +3978,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -3989,7 +3989,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4001,7 +4001,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4013,7 +4013,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4025,7 +4025,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4039,7 +4039,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4053,7 +4053,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4066,7 +4066,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4079,7 +4079,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4091,7 +4091,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4105,7 +4105,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4119,7 +4119,7 @@
       .cr(16)
       .kr(9)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X9__AVX_MUL32, c_div_16) {
@@ -4129,7 +4129,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4141,7 +4141,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4153,7 +4153,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4164,7 +4164,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4175,7 +4175,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4187,7 +4187,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4199,7 +4199,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4211,7 +4211,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4225,7 +4225,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4239,7 +4239,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4252,7 +4252,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4265,7 +4265,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4277,7 +4277,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4291,7 +4291,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4305,7 +4305,7 @@
       .cr(24)
       .kr(9)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X9__AVX_MUL32, c_div_24) {
@@ -4315,7 +4315,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4327,7 +4327,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4339,7 +4339,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4350,7 +4350,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4361,7 +4361,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4373,7 +4373,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4385,7 +4385,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4397,7 +4397,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4411,7 +4411,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4425,7 +4425,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4438,7 +4438,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4451,7 +4451,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4463,7 +4463,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4477,7 +4477,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4491,7 +4491,7 @@
       .cr(8)
       .kr(9)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X9__XOP_MUL32, c_div_8) {
@@ -4501,7 +4501,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4513,7 +4513,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4525,7 +4525,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4536,7 +4536,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4547,7 +4547,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4559,7 +4559,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4571,7 +4571,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4583,7 +4583,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4597,7 +4597,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4611,7 +4611,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4624,7 +4624,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4637,7 +4637,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4649,7 +4649,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4663,7 +4663,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4677,7 +4677,7 @@
       .cr(16)
       .kr(9)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X9__XOP_MUL32, c_div_16) {
@@ -4687,7 +4687,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4699,7 +4699,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4711,7 +4711,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4722,7 +4722,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4733,7 +4733,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4745,7 +4745,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4757,7 +4757,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4769,7 +4769,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4783,7 +4783,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4797,7 +4797,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4810,7 +4810,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4823,7 +4823,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4835,7 +4835,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4849,7 +4849,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4863,7 +4863,7 @@
       .cr(24)
       .kr(9)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X9__XOP_MUL32, c_div_24) {
@@ -4873,7 +4873,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4885,7 +4885,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4897,7 +4897,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4908,7 +4908,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4919,7 +4919,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4931,7 +4931,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4943,7 +4943,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4955,7 +4955,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4969,7 +4969,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -4983,7 +4983,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -4996,7 +4996,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -5009,7 +5009,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -5021,7 +5021,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -5035,7 +5035,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -5049,7 +5049,7 @@
       .cr(8)
       .kr(9)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X9__AVX2_MUL32, c_div_8) {
@@ -5059,7 +5059,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5071,7 +5071,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5083,7 +5083,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5094,7 +5094,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5105,7 +5105,7 @@
         .cr(8)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5117,7 +5117,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5129,7 +5129,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5141,7 +5141,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5155,7 +5155,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -5169,7 +5169,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5182,7 +5182,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5195,7 +5195,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5207,7 +5207,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5221,7 +5221,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -5235,7 +5235,7 @@
       .cr(16)
       .kr(9)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X9__AVX2_MUL32, c_div_16) {
@@ -5245,7 +5245,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5257,7 +5257,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5269,7 +5269,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5280,7 +5280,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5291,7 +5291,7 @@
         .cr(16)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5303,7 +5303,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5315,7 +5315,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5327,7 +5327,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5341,7 +5341,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -5355,7 +5355,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5368,7 +5368,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5381,7 +5381,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5393,7 +5393,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5407,7 +5407,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -5421,7 +5421,7 @@
       .cr(24)
       .kr(9)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X9__AVX2_MUL32, c_div_24) {
@@ -5431,7 +5431,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5443,7 +5443,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5455,7 +5455,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5466,7 +5466,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5477,7 +5477,7 @@
         .cr(24)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5489,7 +5489,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5501,7 +5501,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5513,7 +5513,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5527,7 +5527,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -5541,7 +5541,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5554,7 +5554,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5567,7 +5567,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5579,7 +5579,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5593,7 +5593,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -5607,7 +5607,7 @@
       .cr(32)
       .kr(9)
       .channels(32)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP32X9__AVX2_MUL32, c_div_32) {
@@ -5617,7 +5617,7 @@
         .cr(32)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5629,7 +5629,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5641,7 +5641,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5652,7 +5652,7 @@
         .cr(32)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5663,7 +5663,7 @@
         .cr(32)
         .kr(9)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5675,7 +5675,7 @@
         .kr(9)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5687,7 +5687,7 @@
         .kr(9)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5699,7 +5699,7 @@
         .kr(9)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5713,7 +5713,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -5727,7 +5727,7 @@
         .channels(32)
         .width(5)
         .output_stride(163)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5740,7 +5740,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5753,7 +5753,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5765,7 +5765,7 @@
         .kr(9)
         .channels(channels)
         .input_offset(592)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -5779,7 +5779,7 @@
           .channels(channels)
           .input_offset(592)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -9000,7 +9000,7 @@
       .cr(8)
       .kr(25)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X25__SSE41_MUL16, c_div_8) {
@@ -9010,7 +9010,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9022,7 +9022,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9034,7 +9034,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9045,7 +9045,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9056,7 +9056,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9068,7 +9068,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9080,7 +9080,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9092,7 +9092,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9106,7 +9106,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -9120,7 +9120,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9133,7 +9133,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9146,7 +9146,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9158,7 +9158,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9172,7 +9172,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -9186,7 +9186,7 @@
       .cr(16)
       .kr(25)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X25__SSE41_MUL16, c_div_16) {
@@ -9196,7 +9196,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9208,7 +9208,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9220,7 +9220,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9231,7 +9231,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9242,7 +9242,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9254,7 +9254,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9266,7 +9266,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9278,7 +9278,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9292,7 +9292,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -9306,7 +9306,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9319,7 +9319,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9332,7 +9332,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9344,7 +9344,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9358,7 +9358,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -9372,7 +9372,7 @@
       .cr(24)
       .kr(25)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X25__SSE41_MUL16, c_div_24) {
@@ -9382,7 +9382,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9394,7 +9394,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9406,7 +9406,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9417,7 +9417,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9428,7 +9428,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9440,7 +9440,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9452,7 +9452,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9464,7 +9464,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9478,7 +9478,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -9492,7 +9492,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9505,7 +9505,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9518,7 +9518,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9530,7 +9530,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9544,7 +9544,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -9558,7 +9558,7 @@
       .cr(8)
       .kr(25)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X25__AVX_MUL16, c_div_8) {
@@ -9568,7 +9568,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9580,7 +9580,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9592,7 +9592,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9603,7 +9603,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9614,7 +9614,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9626,7 +9626,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9638,7 +9638,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9650,7 +9650,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9664,7 +9664,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -9678,7 +9678,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9691,7 +9691,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9704,7 +9704,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9716,7 +9716,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9730,7 +9730,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -9744,7 +9744,7 @@
       .cr(16)
       .kr(25)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X25__AVX_MUL16, c_div_16) {
@@ -9754,7 +9754,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9766,7 +9766,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9778,7 +9778,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9789,7 +9789,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9800,7 +9800,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9812,7 +9812,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9824,7 +9824,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9836,7 +9836,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9850,7 +9850,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -9864,7 +9864,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9877,7 +9877,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9890,7 +9890,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9902,7 +9902,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9916,7 +9916,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -9930,7 +9930,7 @@
       .cr(24)
       .kr(25)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X25__AVX_MUL16, c_div_24) {
@@ -9940,7 +9940,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9952,7 +9952,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9964,7 +9964,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9975,7 +9975,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9986,7 +9986,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -9998,7 +9998,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10010,7 +10010,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10022,7 +10022,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10036,7 +10036,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -10050,7 +10050,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10063,7 +10063,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10076,7 +10076,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10088,7 +10088,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10102,7 +10102,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -10116,7 +10116,7 @@
       .cr(16)
       .kr(25)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X25__AVX2_MUL16, c_div_16) {
@@ -10126,7 +10126,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10138,7 +10138,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10150,7 +10150,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10161,7 +10161,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10172,7 +10172,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10184,7 +10184,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10196,7 +10196,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10208,7 +10208,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10222,7 +10222,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -10236,7 +10236,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10249,7 +10249,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10262,7 +10262,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10274,7 +10274,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10288,7 +10288,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -10302,7 +10302,7 @@
       .cr(32)
       .kr(25)
       .channels(32)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP32X25__AVX2_MUL16, c_div_32) {
@@ -10312,7 +10312,7 @@
         .cr(32)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10324,7 +10324,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10336,7 +10336,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10347,7 +10347,7 @@
         .cr(32)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10358,7 +10358,7 @@
         .cr(32)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10370,7 +10370,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10382,7 +10382,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10394,7 +10394,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10408,7 +10408,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -10422,7 +10422,7 @@
         .channels(32)
         .width(5)
         .output_stride(163)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10435,7 +10435,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10448,7 +10448,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10460,7 +10460,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(592)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -10474,7 +10474,7 @@
           .channels(channels)
           .input_offset(592)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -10488,7 +10488,7 @@
       .cr(8)
       .kr(25)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X25__SSE41_MUL32, c_div_8) {
@@ -10498,7 +10498,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10510,7 +10510,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10522,7 +10522,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10533,7 +10533,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10544,7 +10544,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10556,7 +10556,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10568,7 +10568,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10580,7 +10580,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10594,7 +10594,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -10608,7 +10608,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10621,7 +10621,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10634,7 +10634,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10646,7 +10646,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10660,7 +10660,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -10674,7 +10674,7 @@
       .cr(16)
       .kr(25)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X25__SSE41_MUL32, c_div_16) {
@@ -10684,7 +10684,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10696,7 +10696,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10708,7 +10708,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10719,7 +10719,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10730,7 +10730,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10742,7 +10742,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10754,7 +10754,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10766,7 +10766,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10780,7 +10780,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -10794,7 +10794,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10807,7 +10807,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10820,7 +10820,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10832,7 +10832,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10846,7 +10846,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -10860,7 +10860,7 @@
       .cr(24)
       .kr(25)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X25__SSE41_MUL32, c_div_24) {
@@ -10870,7 +10870,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10882,7 +10882,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10894,7 +10894,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10905,7 +10905,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10916,7 +10916,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10928,7 +10928,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10940,7 +10940,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10952,7 +10952,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10966,7 +10966,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -10980,7 +10980,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -10993,7 +10993,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11006,7 +11006,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11018,7 +11018,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11032,7 +11032,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11046,7 +11046,7 @@
       .cr(8)
       .kr(25)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X25__AVX_MUL32, c_div_8) {
@@ -11056,7 +11056,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11068,7 +11068,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11080,7 +11080,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11091,7 +11091,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11102,7 +11102,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11114,7 +11114,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11126,7 +11126,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11138,7 +11138,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11152,7 +11152,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11166,7 +11166,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11179,7 +11179,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11192,7 +11192,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11204,7 +11204,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11218,7 +11218,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11232,7 +11232,7 @@
       .cr(16)
       .kr(25)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X25__AVX_MUL32, c_div_16) {
@@ -11242,7 +11242,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11254,7 +11254,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11266,7 +11266,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11277,7 +11277,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11288,7 +11288,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11300,7 +11300,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11312,7 +11312,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11324,7 +11324,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11338,7 +11338,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11352,7 +11352,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11365,7 +11365,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11378,7 +11378,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11390,7 +11390,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11404,7 +11404,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11418,7 +11418,7 @@
       .cr(24)
       .kr(25)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X25__AVX_MUL32, c_div_24) {
@@ -11428,7 +11428,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11440,7 +11440,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11452,7 +11452,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11463,7 +11463,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11474,7 +11474,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11486,7 +11486,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11498,7 +11498,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11510,7 +11510,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11524,7 +11524,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11538,7 +11538,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11551,7 +11551,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11564,7 +11564,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11576,7 +11576,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11590,7 +11590,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11604,7 +11604,7 @@
       .cr(8)
       .kr(25)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X25__XOP_MUL32, c_div_8) {
@@ -11614,7 +11614,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11626,7 +11626,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11638,7 +11638,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11649,7 +11649,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11660,7 +11660,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11672,7 +11672,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11684,7 +11684,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11696,7 +11696,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11710,7 +11710,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11724,7 +11724,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11737,7 +11737,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11750,7 +11750,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11762,7 +11762,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11776,7 +11776,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11790,7 +11790,7 @@
       .cr(16)
       .kr(25)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X25__XOP_MUL32, c_div_16) {
@@ -11800,7 +11800,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11812,7 +11812,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11824,7 +11824,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11835,7 +11835,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11846,7 +11846,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11858,7 +11858,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11870,7 +11870,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11882,7 +11882,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11896,7 +11896,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11910,7 +11910,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11923,7 +11923,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11936,7 +11936,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11948,7 +11948,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11962,7 +11962,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -11976,7 +11976,7 @@
       .cr(24)
       .kr(25)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X25__XOP_MUL32, c_div_24) {
@@ -11986,7 +11986,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -11998,7 +11998,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12010,7 +12010,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12021,7 +12021,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12032,7 +12032,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12044,7 +12044,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12056,7 +12056,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12068,7 +12068,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12082,7 +12082,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -12096,7 +12096,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12109,7 +12109,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12122,7 +12122,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12134,7 +12134,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -12148,7 +12148,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -12162,7 +12162,7 @@
       .cr(8)
       .kr(25)
       .channels(8)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP8X25__AVX2_MUL32, c_div_8) {
@@ -12172,7 +12172,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12184,7 +12184,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12196,7 +12196,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12207,7 +12207,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12218,7 +12218,7 @@
         .cr(8)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12230,7 +12230,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12242,7 +12242,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12254,7 +12254,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12268,7 +12268,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -12282,7 +12282,7 @@
         .channels(8)
         .width(5)
         .output_stride(43)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12295,7 +12295,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12308,7 +12308,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12320,7 +12320,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(176)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12334,7 +12334,7 @@
           .channels(channels)
           .input_offset(176)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -12348,7 +12348,7 @@
       .cr(16)
       .kr(25)
       .channels(16)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP16X25__AVX2_MUL32, c_div_16) {
@@ -12358,7 +12358,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12370,7 +12370,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12382,7 +12382,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12393,7 +12393,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12404,7 +12404,7 @@
         .cr(16)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12416,7 +12416,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12428,7 +12428,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12440,7 +12440,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12454,7 +12454,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -12468,7 +12468,7 @@
         .channels(16)
         .width(5)
         .output_stride(83)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12481,7 +12481,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12494,7 +12494,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12506,7 +12506,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(304)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12520,7 +12520,7 @@
           .channels(channels)
           .input_offset(304)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -12534,7 +12534,7 @@
       .cr(24)
       .kr(25)
       .channels(24)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP24X25__AVX2_MUL32, c_div_24) {
@@ -12544,7 +12544,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12556,7 +12556,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12568,7 +12568,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12579,7 +12579,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12590,7 +12590,7 @@
         .cr(24)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12602,7 +12602,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12614,7 +12614,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12626,7 +12626,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12640,7 +12640,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -12654,7 +12654,7 @@
         .channels(24)
         .width(5)
         .output_stride(127)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12667,7 +12667,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12680,7 +12680,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12692,7 +12692,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(464)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12706,7 +12706,7 @@
           .channels(channels)
           .input_offset(464)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -12720,7 +12720,7 @@
       .cr(32)
       .kr(25)
       .channels(32)
-      .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_DWCONV_MINMAX_UP32X25__AVX2_MUL32, c_div_32) {
@@ -12730,7 +12730,7 @@
         .cr(32)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12742,7 +12742,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12754,7 +12754,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12765,7 +12765,7 @@
         .cr(32)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12776,7 +12776,7 @@
         .cr(32)
         .kr(25)
         .channels(channels)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12788,7 +12788,7 @@
         .kr(25)
         .channels(channels)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12800,7 +12800,7 @@
         .kr(25)
         .channels(channels)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12812,7 +12812,7 @@
         .kr(25)
         .channels(channels)
         .width(3)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12826,7 +12826,7 @@
           .channels(channels)
           .width(3)
           .step(step)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -12840,7 +12840,7 @@
         .channels(32)
         .width(5)
         .output_stride(163)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12853,7 +12853,7 @@
         .channels(channels)
         .width(3)
         .qmin(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12866,7 +12866,7 @@
         .channels(channels)
         .width(3)
         .qmax(128)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12878,7 +12878,7 @@
         .kr(25)
         .channels(channels)
         .input_offset(592)
-        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -12892,7 +12892,7 @@
           .channels(channels)
           .input_offset(592)
           .zero_index(mz)
-          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
diff --git a/test/qs8-dwconv-minmax.yaml b/test/qs8-dwconv-minmax.yaml
index 4c99dac..3007ce0 100644
--- a/test/qs8-dwconv-minmax.yaml
+++ b/test/qs8-dwconv-minmax.yaml
@@ -23,47 +23,47 @@
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x9__ssse3_mul16
   init: xnn_init_qs8_gemm_sse2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x9__avx_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x9__avx_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x9__avx_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x9__xop_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x9__xop_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x9__xop_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x9__avx2_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x9__avx2_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up32x9__avx2_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x9__avx512skx_mul32
   init: xnn_init_qs8_gemm_sse2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32
@@ -101,47 +101,47 @@
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x25__ssse3_mul16
   init: xnn_init_qs8_gemm_sse2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul16
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x25__sse41_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x25__sse41_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x25__sse41_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x25__avx_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x25__avx_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x25__avx_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x25__xop_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x25__xop_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x25__xop_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up8x25__avx2_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x25__avx2_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up24x25__avx2_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up32x25__avx2_mul32
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up16x25__avx512skx_mul32
   init: xnn_init_qs8_gemm_sse2_params
 - name: xnn_qs8_dwconv_minmax_ukernel_up32x25__avx512skx_mul32
diff --git a/test/qs8-gemm-minmax.cc b/test/qs8-gemm-minmax.cc
index c84d4d0..4f56544 100644
--- a/test/qs8-gemm-minmax.cc
+++ b/test/qs8-gemm-minmax.cc
@@ -46089,7 +46089,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__SSE41_LD64, strided_cn) {
@@ -46103,7 +46103,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__SSE41_LD64, k_eq_8_strided_a) {
@@ -46117,7 +46117,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__SSE41_LD64, k_eq_8_subtile) {
@@ -46133,7 +46133,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46150,7 +46150,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46166,7 +46166,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46181,7 +46181,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46197,7 +46197,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46215,7 +46215,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46232,7 +46232,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46248,7 +46248,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46266,7 +46266,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46283,7 +46283,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46299,7 +46299,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46317,7 +46317,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46335,7 +46335,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46353,7 +46353,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46371,7 +46371,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46390,7 +46390,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46408,7 +46408,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46426,7 +46426,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46444,7 +46444,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46463,7 +46463,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46484,7 +46484,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46501,7 +46501,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__SSE41_LD64, qmax) {
@@ -46515,7 +46515,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__SSE41_LD64, strided_cm) {
@@ -46529,7 +46529,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -46545,7 +46545,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__SSE41_LD64, strided_cn) {
@@ -46559,7 +46559,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__SSE41_LD64, k_eq_8_strided_a) {
@@ -46573,7 +46573,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__SSE41_LD64, k_eq_8_subtile) {
@@ -46589,7 +46589,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46606,7 +46606,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46622,7 +46622,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46637,7 +46637,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46653,7 +46653,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46671,7 +46671,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46688,7 +46688,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46704,7 +46704,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46722,7 +46722,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46739,7 +46739,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46755,7 +46755,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46773,7 +46773,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46791,7 +46791,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46809,7 +46809,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46827,7 +46827,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46846,7 +46846,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46864,7 +46864,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46882,7 +46882,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46900,7 +46900,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46919,7 +46919,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46940,7 +46940,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46957,7 +46957,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__SSE41_LD64, qmax) {
@@ -46971,7 +46971,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__SSE41_LD64, strided_cm) {
@@ -46985,7 +46985,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -47001,7 +47001,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__SSE41_LD64, strided_cn) {
@@ -47015,7 +47015,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__SSE41_LD64, k_eq_8_strided_a) {
@@ -47029,7 +47029,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__SSE41_LD64, k_eq_8_subtile) {
@@ -47045,7 +47045,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47062,7 +47062,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47078,7 +47078,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47093,7 +47093,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47109,7 +47109,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47127,7 +47127,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47144,7 +47144,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47160,7 +47160,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47178,7 +47178,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47195,7 +47195,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47211,7 +47211,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47229,7 +47229,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47247,7 +47247,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47265,7 +47265,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47283,7 +47283,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47302,7 +47302,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47320,7 +47320,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47338,7 +47338,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47356,7 +47356,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47375,7 +47375,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47396,7 +47396,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47413,7 +47413,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__SSE41_LD64, qmax) {
@@ -47427,7 +47427,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__SSE41_LD64, strided_cm) {
@@ -47441,7 +47441,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -47457,7 +47457,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__SSE41_LD64, strided_cn) {
@@ -47471,7 +47471,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__SSE41_LD64, k_eq_8_strided_a) {
@@ -47485,7 +47485,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__SSE41_LD64, k_eq_8_subtile) {
@@ -47501,7 +47501,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47518,7 +47518,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47534,7 +47534,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47549,7 +47549,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47565,7 +47565,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47583,7 +47583,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47600,7 +47600,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47616,7 +47616,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47634,7 +47634,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47651,7 +47651,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47667,7 +47667,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47685,7 +47685,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47703,7 +47703,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47721,7 +47721,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47739,7 +47739,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47758,7 +47758,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47776,7 +47776,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47794,7 +47794,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47812,7 +47812,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47831,7 +47831,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47852,7 +47852,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47869,7 +47869,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__SSE41_LD64, qmax) {
@@ -47883,7 +47883,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__SSE41_LD64, strided_cm) {
@@ -47897,7 +47897,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -47913,7 +47913,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__AVX_LD64, strided_cn) {
@@ -47927,7 +47927,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__AVX_LD64, k_eq_8_strided_a) {
@@ -47941,7 +47941,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__AVX_LD64, k_eq_8_subtile) {
@@ -47957,7 +47957,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47974,7 +47974,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47990,7 +47990,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48005,7 +48005,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48021,7 +48021,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48039,7 +48039,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48056,7 +48056,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48072,7 +48072,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48090,7 +48090,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48107,7 +48107,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48123,7 +48123,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48141,7 +48141,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48159,7 +48159,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48177,7 +48177,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48195,7 +48195,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48214,7 +48214,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48232,7 +48232,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48250,7 +48250,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48268,7 +48268,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48287,7 +48287,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48308,7 +48308,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48325,7 +48325,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__AVX_LD64, qmax) {
@@ -48339,7 +48339,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__AVX_LD64, strided_cm) {
@@ -48353,7 +48353,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -48369,7 +48369,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__AVX_LD64, strided_cn) {
@@ -48383,7 +48383,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__AVX_LD64, k_eq_8_strided_a) {
@@ -48397,7 +48397,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__AVX_LD64, k_eq_8_subtile) {
@@ -48413,7 +48413,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48430,7 +48430,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48446,7 +48446,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48461,7 +48461,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48477,7 +48477,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48495,7 +48495,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48512,7 +48512,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48528,7 +48528,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48546,7 +48546,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48563,7 +48563,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48579,7 +48579,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48597,7 +48597,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48615,7 +48615,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48633,7 +48633,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48651,7 +48651,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48670,7 +48670,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48688,7 +48688,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48706,7 +48706,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48724,7 +48724,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48743,7 +48743,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48764,7 +48764,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48781,7 +48781,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__AVX_LD64, qmax) {
@@ -48795,7 +48795,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__AVX_LD64, strided_cm) {
@@ -48809,7 +48809,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -48825,7 +48825,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__AVX_LD64, strided_cn) {
@@ -48839,7 +48839,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__AVX_LD64, k_eq_8_strided_a) {
@@ -48853,7 +48853,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__AVX_LD64, k_eq_8_subtile) {
@@ -48869,7 +48869,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48886,7 +48886,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48902,7 +48902,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48917,7 +48917,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48933,7 +48933,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48951,7 +48951,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48968,7 +48968,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48984,7 +48984,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49002,7 +49002,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49019,7 +49019,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49035,7 +49035,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49053,7 +49053,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49071,7 +49071,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49089,7 +49089,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49107,7 +49107,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49126,7 +49126,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49144,7 +49144,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49162,7 +49162,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49180,7 +49180,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49199,7 +49199,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49220,7 +49220,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49237,7 +49237,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__AVX_LD64, qmax) {
@@ -49251,7 +49251,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__AVX_LD64, strided_cm) {
@@ -49265,7 +49265,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -49281,7 +49281,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__AVX_LD64, strided_cn) {
@@ -49295,7 +49295,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__AVX_LD64, k_eq_8_strided_a) {
@@ -49309,7 +49309,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__AVX_LD64, k_eq_8_subtile) {
@@ -49325,7 +49325,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49342,7 +49342,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49358,7 +49358,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49373,7 +49373,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49389,7 +49389,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49407,7 +49407,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49424,7 +49424,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49440,7 +49440,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49458,7 +49458,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49475,7 +49475,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49491,7 +49491,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49509,7 +49509,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49527,7 +49527,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49545,7 +49545,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49563,7 +49563,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49582,7 +49582,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49600,7 +49600,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49618,7 +49618,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49636,7 +49636,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49655,7 +49655,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49676,7 +49676,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49693,7 +49693,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__AVX_LD64, qmax) {
@@ -49707,7 +49707,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__AVX_LD64, strided_cm) {
@@ -49721,7 +49721,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -49737,7 +49737,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__XOP_LD64, strided_cn) {
@@ -49751,7 +49751,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__XOP_LD64, k_eq_8_strided_a) {
@@ -49765,7 +49765,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__XOP_LD64, k_eq_8_subtile) {
@@ -49781,7 +49781,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49798,7 +49798,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49814,7 +49814,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49829,7 +49829,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49845,7 +49845,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49863,7 +49863,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49880,7 +49880,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49896,7 +49896,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49914,7 +49914,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49931,7 +49931,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49947,7 +49947,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49965,7 +49965,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49983,7 +49983,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50001,7 +50001,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50019,7 +50019,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50038,7 +50038,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50056,7 +50056,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50074,7 +50074,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50092,7 +50092,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50111,7 +50111,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50132,7 +50132,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50149,7 +50149,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__XOP_LD64, qmax) {
@@ -50163,7 +50163,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__XOP_LD64, strided_cm) {
@@ -50177,7 +50177,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -50193,7 +50193,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__XOP_LD64, strided_cn) {
@@ -50207,7 +50207,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__XOP_LD64, k_eq_8_strided_a) {
@@ -50221,7 +50221,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__XOP_LD64, k_eq_8_subtile) {
@@ -50237,7 +50237,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50254,7 +50254,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50270,7 +50270,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50285,7 +50285,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50301,7 +50301,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50319,7 +50319,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50336,7 +50336,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50352,7 +50352,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50370,7 +50370,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50387,7 +50387,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50403,7 +50403,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50421,7 +50421,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50439,7 +50439,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50457,7 +50457,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50475,7 +50475,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50494,7 +50494,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50512,7 +50512,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50530,7 +50530,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50548,7 +50548,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50567,7 +50567,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50588,7 +50588,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50605,7 +50605,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__XOP_LD64, qmax) {
@@ -50619,7 +50619,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__XOP_LD64, strided_cm) {
@@ -50633,7 +50633,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -50649,7 +50649,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__XOP_LD64, strided_cn) {
@@ -50663,7 +50663,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__XOP_LD64, k_eq_8_strided_a) {
@@ -50677,7 +50677,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__XOP_LD64, k_eq_8_subtile) {
@@ -50693,7 +50693,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50710,7 +50710,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50726,7 +50726,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50741,7 +50741,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50757,7 +50757,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50775,7 +50775,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50792,7 +50792,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50808,7 +50808,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50826,7 +50826,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50843,7 +50843,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50859,7 +50859,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -50877,7 +50877,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50895,7 +50895,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50913,7 +50913,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50931,7 +50931,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50950,7 +50950,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -50968,7 +50968,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -50986,7 +50986,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -51004,7 +51004,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -51023,7 +51023,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -51044,7 +51044,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -51061,7 +51061,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__XOP_LD64, qmax) {
@@ -51075,7 +51075,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__XOP_LD64, strided_cm) {
@@ -51089,7 +51089,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -51105,7 +51105,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__XOP_LD64, strided_cn) {
@@ -51119,7 +51119,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__XOP_LD64, k_eq_8_strided_a) {
@@ -51133,7 +51133,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__XOP_LD64, k_eq_8_subtile) {
@@ -51149,7 +51149,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -51166,7 +51166,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -51182,7 +51182,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -51197,7 +51197,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -51213,7 +51213,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -51231,7 +51231,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -51248,7 +51248,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -51264,7 +51264,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -51282,7 +51282,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -51299,7 +51299,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -51315,7 +51315,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -51333,7 +51333,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -51351,7 +51351,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -51369,7 +51369,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -51387,7 +51387,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -51406,7 +51406,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -51424,7 +51424,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -51442,7 +51442,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -51460,7 +51460,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -51479,7 +51479,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -51500,7 +51500,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -51517,7 +51517,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__XOP_LD64, qmax) {
@@ -51531,7 +51531,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__XOP_LD64, strided_cm) {
@@ -51545,7 +51545,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -55209,7 +55209,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__SSE41_LD128, strided_cn) {
@@ -55223,7 +55223,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__SSE41_LD128, k_eq_8_strided_a) {
@@ -55237,7 +55237,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__SSE41_LD128, k_eq_8_subtile) {
@@ -55253,7 +55253,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55270,7 +55270,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55286,7 +55286,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55301,7 +55301,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55317,7 +55317,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55335,7 +55335,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55352,7 +55352,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55368,7 +55368,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55386,7 +55386,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55403,7 +55403,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55419,7 +55419,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55437,7 +55437,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55455,7 +55455,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55473,7 +55473,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55491,7 +55491,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55510,7 +55510,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55528,7 +55528,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55546,7 +55546,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55564,7 +55564,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55583,7 +55583,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55604,7 +55604,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55621,7 +55621,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__SSE41_LD128, qmax) {
@@ -55635,7 +55635,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__SSE41_LD128, strided_cm) {
@@ -55649,7 +55649,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -55665,7 +55665,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__SSE41_LD128, strided_cn) {
@@ -55679,7 +55679,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__SSE41_LD128, k_eq_8_strided_a) {
@@ -55693,7 +55693,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__SSE41_LD128, k_eq_8_subtile) {
@@ -55709,7 +55709,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55726,7 +55726,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55742,7 +55742,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55757,7 +55757,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55773,7 +55773,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55791,7 +55791,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55808,7 +55808,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55824,7 +55824,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55842,7 +55842,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55859,7 +55859,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55875,7 +55875,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55893,7 +55893,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55911,7 +55911,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55929,7 +55929,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55947,7 +55947,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55966,7 +55966,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55984,7 +55984,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56002,7 +56002,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56020,7 +56020,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56039,7 +56039,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56060,7 +56060,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56077,7 +56077,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__SSE41_LD128, qmax) {
@@ -56091,7 +56091,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__SSE41_LD128, strided_cm) {
@@ -56105,7 +56105,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -56121,7 +56121,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__SSE41_LD128, strided_cn) {
@@ -56135,7 +56135,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__SSE41_LD128, k_eq_8_strided_a) {
@@ -56149,7 +56149,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__SSE41_LD128, k_eq_8_subtile) {
@@ -56165,7 +56165,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56182,7 +56182,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56198,7 +56198,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56213,7 +56213,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56229,7 +56229,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56247,7 +56247,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56264,7 +56264,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56280,7 +56280,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56298,7 +56298,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56315,7 +56315,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56331,7 +56331,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56349,7 +56349,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56367,7 +56367,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56385,7 +56385,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56403,7 +56403,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56422,7 +56422,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56440,7 +56440,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56458,7 +56458,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56476,7 +56476,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56495,7 +56495,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56516,7 +56516,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56533,7 +56533,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__SSE41_LD128, qmax) {
@@ -56547,7 +56547,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__SSE41_LD128, strided_cm) {
@@ -56561,7 +56561,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -56577,7 +56577,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__SSE41_LD128, strided_cn) {
@@ -56591,7 +56591,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__SSE41_LD128, k_eq_8_strided_a) {
@@ -56605,7 +56605,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__SSE41_LD128, k_eq_8_subtile) {
@@ -56621,7 +56621,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56638,7 +56638,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56654,7 +56654,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56669,7 +56669,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56685,7 +56685,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56703,7 +56703,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56720,7 +56720,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56736,7 +56736,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56754,7 +56754,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56771,7 +56771,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56787,7 +56787,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56805,7 +56805,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56823,7 +56823,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56841,7 +56841,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56859,7 +56859,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56878,7 +56878,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56896,7 +56896,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56914,7 +56914,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56932,7 +56932,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56951,7 +56951,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56972,7 +56972,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56989,7 +56989,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__SSE41_LD128, qmax) {
@@ -57003,7 +57003,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__SSE41_LD128, strided_cm) {
@@ -57017,7 +57017,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -57033,7 +57033,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__AVX_LD128, strided_cn) {
@@ -57047,7 +57047,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__AVX_LD128, k_eq_8_strided_a) {
@@ -57061,7 +57061,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__AVX_LD128, k_eq_8_subtile) {
@@ -57077,7 +57077,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57094,7 +57094,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57110,7 +57110,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57125,7 +57125,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57141,7 +57141,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57159,7 +57159,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57176,7 +57176,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57192,7 +57192,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57210,7 +57210,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57227,7 +57227,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57243,7 +57243,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57261,7 +57261,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57279,7 +57279,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57297,7 +57297,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57315,7 +57315,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57334,7 +57334,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57352,7 +57352,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57370,7 +57370,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57388,7 +57388,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57407,7 +57407,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57428,7 +57428,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57445,7 +57445,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__AVX_LD128, qmax) {
@@ -57459,7 +57459,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__AVX_LD128, strided_cm) {
@@ -57473,7 +57473,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -57489,7 +57489,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__AVX_LD128, strided_cn) {
@@ -57503,7 +57503,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__AVX_LD128, k_eq_8_strided_a) {
@@ -57517,7 +57517,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__AVX_LD128, k_eq_8_subtile) {
@@ -57533,7 +57533,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57550,7 +57550,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57566,7 +57566,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57581,7 +57581,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57597,7 +57597,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57615,7 +57615,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57632,7 +57632,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57648,7 +57648,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57666,7 +57666,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57683,7 +57683,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57699,7 +57699,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57717,7 +57717,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57735,7 +57735,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57753,7 +57753,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57771,7 +57771,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57790,7 +57790,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57808,7 +57808,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57826,7 +57826,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57844,7 +57844,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57863,7 +57863,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57884,7 +57884,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57901,7 +57901,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__AVX_LD128, qmax) {
@@ -57915,7 +57915,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__AVX_LD128, strided_cm) {
@@ -57929,7 +57929,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -57945,7 +57945,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__AVX_LD128, strided_cn) {
@@ -57959,7 +57959,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__AVX_LD128, k_eq_8_strided_a) {
@@ -57973,7 +57973,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__AVX_LD128, k_eq_8_subtile) {
@@ -57989,7 +57989,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58006,7 +58006,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58022,7 +58022,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58037,7 +58037,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58053,7 +58053,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58071,7 +58071,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58088,7 +58088,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58104,7 +58104,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58122,7 +58122,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58139,7 +58139,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58155,7 +58155,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58173,7 +58173,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58191,7 +58191,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58209,7 +58209,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58227,7 +58227,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58246,7 +58246,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58264,7 +58264,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58282,7 +58282,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58300,7 +58300,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58319,7 +58319,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58340,7 +58340,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58357,7 +58357,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__AVX_LD128, qmax) {
@@ -58371,7 +58371,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__AVX_LD128, strided_cm) {
@@ -58385,7 +58385,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -58401,7 +58401,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__AVX_LD128, strided_cn) {
@@ -58415,7 +58415,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__AVX_LD128, k_eq_8_strided_a) {
@@ -58429,7 +58429,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__AVX_LD128, k_eq_8_subtile) {
@@ -58445,7 +58445,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58462,7 +58462,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58478,7 +58478,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58493,7 +58493,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58509,7 +58509,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58527,7 +58527,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58544,7 +58544,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58560,7 +58560,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58578,7 +58578,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58595,7 +58595,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58611,7 +58611,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58629,7 +58629,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58647,7 +58647,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58665,7 +58665,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58683,7 +58683,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58702,7 +58702,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58720,7 +58720,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58738,7 +58738,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58756,7 +58756,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58775,7 +58775,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58796,7 +58796,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58813,7 +58813,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__AVX_LD128, qmax) {
@@ -58827,7 +58827,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__AVX_LD128, strided_cm) {
@@ -58841,7 +58841,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -58857,7 +58857,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__XOP_LD128, strided_cn) {
@@ -58871,7 +58871,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__XOP_LD128, k_eq_8_strided_a) {
@@ -58885,7 +58885,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__XOP_LD128, k_eq_8_subtile) {
@@ -58901,7 +58901,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58918,7 +58918,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58934,7 +58934,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58949,7 +58949,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58965,7 +58965,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58983,7 +58983,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59000,7 +59000,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59016,7 +59016,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59034,7 +59034,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59051,7 +59051,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59067,7 +59067,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59085,7 +59085,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59103,7 +59103,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59121,7 +59121,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59139,7 +59139,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59158,7 +59158,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59176,7 +59176,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59194,7 +59194,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59212,7 +59212,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59231,7 +59231,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59252,7 +59252,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59269,7 +59269,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__XOP_LD128, qmax) {
@@ -59283,7 +59283,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C2__XOP_LD128, strided_cm) {
@@ -59297,7 +59297,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -59313,7 +59313,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__XOP_LD128, strided_cn) {
@@ -59327,7 +59327,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__XOP_LD128, k_eq_8_strided_a) {
@@ -59341,7 +59341,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__XOP_LD128, k_eq_8_subtile) {
@@ -59357,7 +59357,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59374,7 +59374,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59390,7 +59390,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59405,7 +59405,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59421,7 +59421,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59439,7 +59439,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59456,7 +59456,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59472,7 +59472,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59490,7 +59490,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59507,7 +59507,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59523,7 +59523,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59541,7 +59541,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59559,7 +59559,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59577,7 +59577,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59595,7 +59595,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59614,7 +59614,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59632,7 +59632,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59650,7 +59650,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59668,7 +59668,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59687,7 +59687,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59708,7 +59708,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59725,7 +59725,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__XOP_LD128, qmax) {
@@ -59739,7 +59739,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C2__XOP_LD128, strided_cm) {
@@ -59753,7 +59753,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -59769,7 +59769,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__XOP_LD128, strided_cn) {
@@ -59783,7 +59783,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__XOP_LD128, k_eq_8_strided_a) {
@@ -59797,7 +59797,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__XOP_LD128, k_eq_8_subtile) {
@@ -59813,7 +59813,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -59830,7 +59830,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59846,7 +59846,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59861,7 +59861,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59877,7 +59877,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59895,7 +59895,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59912,7 +59912,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59928,7 +59928,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59946,7 +59946,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -59963,7 +59963,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59979,7 +59979,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -59997,7 +59997,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -60015,7 +60015,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60033,7 +60033,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60051,7 +60051,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60070,7 +60070,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -60088,7 +60088,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60106,7 +60106,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60124,7 +60124,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60143,7 +60143,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -60164,7 +60164,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -60181,7 +60181,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__XOP_LD128, qmax) {
@@ -60195,7 +60195,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C2__XOP_LD128, strided_cm) {
@@ -60209,7 +60209,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -60225,7 +60225,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__XOP_LD128, strided_cn) {
@@ -60239,7 +60239,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__XOP_LD128, k_eq_8_strided_a) {
@@ -60253,7 +60253,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__XOP_LD128, k_eq_8_subtile) {
@@ -60269,7 +60269,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60286,7 +60286,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -60302,7 +60302,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -60317,7 +60317,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -60333,7 +60333,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -60351,7 +60351,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -60368,7 +60368,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -60384,7 +60384,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -60402,7 +60402,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -60419,7 +60419,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -60435,7 +60435,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -60453,7 +60453,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -60471,7 +60471,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60489,7 +60489,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60507,7 +60507,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60526,7 +60526,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -60544,7 +60544,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60562,7 +60562,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60580,7 +60580,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -60599,7 +60599,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -60620,7 +60620,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -60637,7 +60637,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__XOP_LD128, qmax) {
@@ -60651,7 +60651,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X4C2__XOP_LD128, strided_cm) {
@@ -60665,7 +60665,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -62494,7 +62494,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C2__SSE41, strided_cn) {
@@ -62509,7 +62509,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C2__SSE41, k_eq_8_strided_a) {
@@ -62524,7 +62524,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C2__SSE41, k_eq_8_subtile) {
@@ -62541,7 +62541,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62559,7 +62559,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62576,7 +62576,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62592,7 +62592,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62609,7 +62609,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62628,7 +62628,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62646,7 +62646,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62663,7 +62663,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62682,7 +62682,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62700,7 +62700,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62717,7 +62717,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62736,7 +62736,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62755,7 +62755,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62774,7 +62774,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62793,7 +62793,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62813,7 +62813,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62832,7 +62832,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62851,7 +62851,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62870,7 +62870,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62890,7 +62890,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62912,7 +62912,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62930,7 +62930,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -62947,7 +62947,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_4X4C2__SSE41, strided_cn) {
@@ -62962,7 +62962,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_4X4C2__SSE41, k_eq_8_strided_a) {
@@ -62977,7 +62977,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_4X4C2__SSE41, k_eq_8_subtile) {
@@ -62994,7 +62994,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63012,7 +63012,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63029,7 +63029,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63045,7 +63045,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63062,7 +63062,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63081,7 +63081,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63099,7 +63099,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63116,7 +63116,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63135,7 +63135,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63153,7 +63153,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63170,7 +63170,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63189,7 +63189,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63208,7 +63208,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63227,7 +63227,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63246,7 +63246,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63266,7 +63266,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63285,7 +63285,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63304,7 +63304,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63323,7 +63323,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63343,7 +63343,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63365,7 +63365,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63383,7 +63383,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -63400,7 +63400,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C2__AVX, strided_cn) {
@@ -63415,7 +63415,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C2__AVX, k_eq_8_strided_a) {
@@ -63430,7 +63430,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C2__AVX, k_eq_8_subtile) {
@@ -63447,7 +63447,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63465,7 +63465,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63482,7 +63482,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63498,7 +63498,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63515,7 +63515,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63534,7 +63534,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63552,7 +63552,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63569,7 +63569,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63588,7 +63588,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63606,7 +63606,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63623,7 +63623,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63642,7 +63642,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63661,7 +63661,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63680,7 +63680,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63699,7 +63699,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63719,7 +63719,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63738,7 +63738,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63757,7 +63757,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63776,7 +63776,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63796,7 +63796,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63818,7 +63818,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63836,7 +63836,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -63853,7 +63853,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_4X4C2__AVX, strided_cn) {
@@ -63868,7 +63868,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_4X4C2__AVX, k_eq_8_strided_a) {
@@ -63883,7 +63883,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_4X4C2__AVX, k_eq_8_subtile) {
@@ -63900,7 +63900,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63918,7 +63918,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63935,7 +63935,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63951,7 +63951,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63968,7 +63968,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63987,7 +63987,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64005,7 +64005,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64022,7 +64022,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64041,7 +64041,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64059,7 +64059,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64076,7 +64076,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64095,7 +64095,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64114,7 +64114,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64133,7 +64133,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64152,7 +64152,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64172,7 +64172,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64191,7 +64191,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64210,7 +64210,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64229,7 +64229,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64249,7 +64249,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64271,7 +64271,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64289,7 +64289,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -64306,7 +64306,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C2__XOP, strided_cn) {
@@ -64321,7 +64321,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C2__XOP, k_eq_8_strided_a) {
@@ -64336,7 +64336,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C2__XOP, k_eq_8_subtile) {
@@ -64353,7 +64353,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64371,7 +64371,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64388,7 +64388,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64404,7 +64404,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64421,7 +64421,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64440,7 +64440,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64458,7 +64458,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64475,7 +64475,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64494,7 +64494,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64512,7 +64512,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64529,7 +64529,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64548,7 +64548,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64567,7 +64567,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64586,7 +64586,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64605,7 +64605,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64625,7 +64625,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64644,7 +64644,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64663,7 +64663,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64682,7 +64682,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64702,7 +64702,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64724,7 +64724,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64742,7 +64742,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -64759,7 +64759,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_4X4C2__XOP, strided_cn) {
@@ -64774,7 +64774,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_4X4C2__XOP, k_eq_8_strided_a) {
@@ -64789,7 +64789,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_4X4C2__XOP, k_eq_8_subtile) {
@@ -64806,7 +64806,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64824,7 +64824,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64841,7 +64841,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64857,7 +64857,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64874,7 +64874,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64893,7 +64893,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64911,7 +64911,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64928,7 +64928,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64947,7 +64947,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64965,7 +64965,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64982,7 +64982,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -65001,7 +65001,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65020,7 +65020,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65039,7 +65039,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65058,7 +65058,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65078,7 +65078,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65097,7 +65097,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65116,7 +65116,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65135,7 +65135,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65155,7 +65155,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65177,7 +65177,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65195,7 +65195,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -67947,7 +67947,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__SSE41_LD64, strided_cn) {
@@ -67961,7 +67961,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__SSE41_LD64, k_eq_8_strided_a) {
@@ -67975,7 +67975,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__SSE41_LD64, k_eq_8_subtile) {
@@ -67991,7 +67991,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68008,7 +68008,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68024,7 +68024,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68039,7 +68039,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68055,7 +68055,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68073,7 +68073,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68090,7 +68090,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68106,7 +68106,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68124,7 +68124,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68141,7 +68141,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68157,7 +68157,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68175,7 +68175,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68193,7 +68193,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68211,7 +68211,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68229,7 +68229,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68248,7 +68248,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68266,7 +68266,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68284,7 +68284,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68302,7 +68302,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68321,7 +68321,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68342,7 +68342,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68359,7 +68359,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__SSE41_LD64, qmax) {
@@ -68373,7 +68373,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__SSE41_LD64, strided_cm) {
@@ -68387,7 +68387,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -68403,7 +68403,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__SSE41_LD64, strided_cn) {
@@ -68417,7 +68417,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__SSE41_LD64, k_eq_8_strided_a) {
@@ -68431,7 +68431,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__SSE41_LD64, k_eq_8_subtile) {
@@ -68447,7 +68447,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68464,7 +68464,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68480,7 +68480,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68495,7 +68495,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68511,7 +68511,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68529,7 +68529,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68546,7 +68546,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68562,7 +68562,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68580,7 +68580,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68597,7 +68597,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68613,7 +68613,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68631,7 +68631,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68649,7 +68649,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68667,7 +68667,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68685,7 +68685,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68704,7 +68704,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68722,7 +68722,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68740,7 +68740,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68758,7 +68758,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68777,7 +68777,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68798,7 +68798,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68815,7 +68815,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__SSE41_LD64, qmax) {
@@ -68829,7 +68829,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__SSE41_LD64, strided_cm) {
@@ -68843,7 +68843,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -68859,7 +68859,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__SSE41_LD64, strided_cn) {
@@ -68873,7 +68873,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__SSE41_LD64, k_eq_8_strided_a) {
@@ -68887,7 +68887,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__SSE41_LD64, k_eq_8_subtile) {
@@ -68903,7 +68903,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68920,7 +68920,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68936,7 +68936,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68951,7 +68951,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68967,7 +68967,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68985,7 +68985,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69002,7 +69002,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69018,7 +69018,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69036,7 +69036,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69053,7 +69053,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69069,7 +69069,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69087,7 +69087,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69105,7 +69105,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69123,7 +69123,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69141,7 +69141,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69160,7 +69160,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69178,7 +69178,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69196,7 +69196,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69214,7 +69214,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69233,7 +69233,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69254,7 +69254,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69271,7 +69271,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__SSE41_LD64, qmax) {
@@ -69285,7 +69285,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__SSE41_LD64, strided_cm) {
@@ -69299,7 +69299,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -69315,7 +69315,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__AVX_LD64, strided_cn) {
@@ -69329,7 +69329,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__AVX_LD64, k_eq_8_strided_a) {
@@ -69343,7 +69343,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__AVX_LD64, k_eq_8_subtile) {
@@ -69359,7 +69359,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69376,7 +69376,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69392,7 +69392,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69407,7 +69407,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69423,7 +69423,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69441,7 +69441,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69458,7 +69458,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69474,7 +69474,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69492,7 +69492,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69509,7 +69509,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69525,7 +69525,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69543,7 +69543,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69561,7 +69561,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69579,7 +69579,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69597,7 +69597,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69616,7 +69616,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69634,7 +69634,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69652,7 +69652,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69670,7 +69670,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69689,7 +69689,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69710,7 +69710,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69727,7 +69727,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__AVX_LD64, qmax) {
@@ -69741,7 +69741,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__AVX_LD64, strided_cm) {
@@ -69755,7 +69755,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -69771,7 +69771,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__AVX_LD64, strided_cn) {
@@ -69785,7 +69785,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__AVX_LD64, k_eq_8_strided_a) {
@@ -69799,7 +69799,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__AVX_LD64, k_eq_8_subtile) {
@@ -69815,7 +69815,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69832,7 +69832,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69848,7 +69848,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69863,7 +69863,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69879,7 +69879,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69897,7 +69897,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69914,7 +69914,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69930,7 +69930,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69948,7 +69948,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69965,7 +69965,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69981,7 +69981,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69999,7 +69999,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70017,7 +70017,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70035,7 +70035,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70053,7 +70053,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70072,7 +70072,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70090,7 +70090,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70108,7 +70108,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70126,7 +70126,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70145,7 +70145,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70166,7 +70166,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70183,7 +70183,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__AVX_LD64, qmax) {
@@ -70197,7 +70197,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__AVX_LD64, strided_cm) {
@@ -70211,7 +70211,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -70227,7 +70227,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__AVX_LD64, strided_cn) {
@@ -70241,7 +70241,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__AVX_LD64, k_eq_8_strided_a) {
@@ -70255,7 +70255,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__AVX_LD64, k_eq_8_subtile) {
@@ -70271,7 +70271,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70288,7 +70288,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70304,7 +70304,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70319,7 +70319,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70335,7 +70335,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70353,7 +70353,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70370,7 +70370,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70386,7 +70386,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70404,7 +70404,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70421,7 +70421,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70437,7 +70437,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70455,7 +70455,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70473,7 +70473,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70491,7 +70491,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70509,7 +70509,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70528,7 +70528,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70546,7 +70546,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70564,7 +70564,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70582,7 +70582,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70601,7 +70601,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70622,7 +70622,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70639,7 +70639,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__AVX_LD64, qmax) {
@@ -70653,7 +70653,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__AVX_LD64, strided_cm) {
@@ -70667,7 +70667,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -70683,7 +70683,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__XOP_LD64, strided_cn) {
@@ -70697,7 +70697,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__XOP_LD64, k_eq_8_strided_a) {
@@ -70711,7 +70711,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__XOP_LD64, k_eq_8_subtile) {
@@ -70727,7 +70727,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70744,7 +70744,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70760,7 +70760,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70775,7 +70775,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70791,7 +70791,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70809,7 +70809,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70826,7 +70826,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70842,7 +70842,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70860,7 +70860,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70877,7 +70877,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70893,7 +70893,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70911,7 +70911,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70929,7 +70929,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70947,7 +70947,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70965,7 +70965,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70984,7 +70984,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71002,7 +71002,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71020,7 +71020,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71038,7 +71038,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71057,7 +71057,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71078,7 +71078,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71095,7 +71095,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__XOP_LD64, qmax) {
@@ -71109,7 +71109,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__XOP_LD64, strided_cm) {
@@ -71123,7 +71123,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -71139,7 +71139,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__XOP_LD64, strided_cn) {
@@ -71153,7 +71153,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__XOP_LD64, k_eq_8_strided_a) {
@@ -71167,7 +71167,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__XOP_LD64, k_eq_8_subtile) {
@@ -71183,7 +71183,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71200,7 +71200,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71216,7 +71216,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71231,7 +71231,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71247,7 +71247,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71265,7 +71265,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71282,7 +71282,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71298,7 +71298,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71316,7 +71316,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71333,7 +71333,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71349,7 +71349,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71367,7 +71367,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71385,7 +71385,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71403,7 +71403,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71421,7 +71421,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71440,7 +71440,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71458,7 +71458,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71476,7 +71476,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71494,7 +71494,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71513,7 +71513,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71534,7 +71534,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71551,7 +71551,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__XOP_LD64, qmax) {
@@ -71565,7 +71565,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__XOP_LD64, strided_cm) {
@@ -71579,7 +71579,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -71595,7 +71595,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__XOP_LD64, strided_cn) {
@@ -71609,7 +71609,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__XOP_LD64, k_eq_8_strided_a) {
@@ -71623,7 +71623,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__XOP_LD64, k_eq_8_subtile) {
@@ -71639,7 +71639,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71656,7 +71656,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71672,7 +71672,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71687,7 +71687,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71703,7 +71703,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71721,7 +71721,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71738,7 +71738,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71754,7 +71754,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71772,7 +71772,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71789,7 +71789,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71805,7 +71805,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71823,7 +71823,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71841,7 +71841,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71859,7 +71859,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71877,7 +71877,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71896,7 +71896,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71914,7 +71914,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71932,7 +71932,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71950,7 +71950,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71969,7 +71969,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71990,7 +71990,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -72007,7 +72007,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__XOP_LD64, qmax) {
@@ -72021,7 +72021,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__XOP_LD64, strided_cm) {
@@ -72035,7 +72035,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -74787,7 +74787,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__SSE41_LD128, strided_cn) {
@@ -74801,7 +74801,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__SSE41_LD128, k_eq_8_strided_a) {
@@ -74815,7 +74815,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__SSE41_LD128, k_eq_8_subtile) {
@@ -74831,7 +74831,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74848,7 +74848,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74864,7 +74864,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74879,7 +74879,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74895,7 +74895,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74913,7 +74913,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74930,7 +74930,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74946,7 +74946,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74964,7 +74964,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74981,7 +74981,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74997,7 +74997,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75015,7 +75015,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75033,7 +75033,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75051,7 +75051,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75069,7 +75069,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75088,7 +75088,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75106,7 +75106,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75124,7 +75124,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75142,7 +75142,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75161,7 +75161,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75182,7 +75182,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75199,7 +75199,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__SSE41_LD128, qmax) {
@@ -75213,7 +75213,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__SSE41_LD128, strided_cm) {
@@ -75227,7 +75227,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -75243,7 +75243,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__SSE41_LD128, strided_cn) {
@@ -75257,7 +75257,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__SSE41_LD128, k_eq_8_strided_a) {
@@ -75271,7 +75271,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__SSE41_LD128, k_eq_8_subtile) {
@@ -75287,7 +75287,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75304,7 +75304,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75320,7 +75320,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75335,7 +75335,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75351,7 +75351,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75369,7 +75369,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75386,7 +75386,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75402,7 +75402,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75420,7 +75420,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75437,7 +75437,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75453,7 +75453,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75471,7 +75471,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75489,7 +75489,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75507,7 +75507,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75525,7 +75525,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75544,7 +75544,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75562,7 +75562,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75580,7 +75580,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75598,7 +75598,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75617,7 +75617,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75638,7 +75638,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75655,7 +75655,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__SSE41_LD128, qmax) {
@@ -75669,7 +75669,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__SSE41_LD128, strided_cm) {
@@ -75683,7 +75683,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -75699,7 +75699,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__SSE41_LD128, strided_cn) {
@@ -75713,7 +75713,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__SSE41_LD128, k_eq_8_strided_a) {
@@ -75727,7 +75727,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__SSE41_LD128, k_eq_8_subtile) {
@@ -75743,7 +75743,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75760,7 +75760,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75776,7 +75776,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75791,7 +75791,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75807,7 +75807,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75825,7 +75825,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75842,7 +75842,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75858,7 +75858,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75876,7 +75876,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75893,7 +75893,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75909,7 +75909,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75927,7 +75927,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75945,7 +75945,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75963,7 +75963,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75981,7 +75981,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76000,7 +76000,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76018,7 +76018,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76036,7 +76036,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76054,7 +76054,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76073,7 +76073,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76094,7 +76094,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76111,7 +76111,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__SSE41_LD128, qmax) {
@@ -76125,7 +76125,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__SSE41_LD128, strided_cm) {
@@ -76139,7 +76139,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -76155,7 +76155,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__AVX_LD128, strided_cn) {
@@ -76169,7 +76169,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__AVX_LD128, k_eq_8_strided_a) {
@@ -76183,7 +76183,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__AVX_LD128, k_eq_8_subtile) {
@@ -76199,7 +76199,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76216,7 +76216,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76232,7 +76232,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76247,7 +76247,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76263,7 +76263,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76281,7 +76281,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76298,7 +76298,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76314,7 +76314,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76332,7 +76332,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76349,7 +76349,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76365,7 +76365,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76383,7 +76383,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76401,7 +76401,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76419,7 +76419,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76437,7 +76437,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76456,7 +76456,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76474,7 +76474,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76492,7 +76492,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76510,7 +76510,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76529,7 +76529,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76550,7 +76550,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76567,7 +76567,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__AVX_LD128, qmax) {
@@ -76581,7 +76581,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__AVX_LD128, strided_cm) {
@@ -76595,7 +76595,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -76611,7 +76611,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__AVX_LD128, strided_cn) {
@@ -76625,7 +76625,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__AVX_LD128, k_eq_8_strided_a) {
@@ -76639,7 +76639,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__AVX_LD128, k_eq_8_subtile) {
@@ -76655,7 +76655,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76672,7 +76672,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76688,7 +76688,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76703,7 +76703,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76719,7 +76719,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76737,7 +76737,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76754,7 +76754,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76770,7 +76770,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76788,7 +76788,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76805,7 +76805,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76821,7 +76821,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -76839,7 +76839,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76857,7 +76857,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76875,7 +76875,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76893,7 +76893,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76912,7 +76912,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -76930,7 +76930,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76948,7 +76948,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76966,7 +76966,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -76985,7 +76985,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77006,7 +77006,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77023,7 +77023,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__AVX_LD128, qmax) {
@@ -77037,7 +77037,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__AVX_LD128, strided_cm) {
@@ -77051,7 +77051,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -77067,7 +77067,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__AVX_LD128, strided_cn) {
@@ -77081,7 +77081,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__AVX_LD128, k_eq_8_strided_a) {
@@ -77095,7 +77095,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__AVX_LD128, k_eq_8_subtile) {
@@ -77111,7 +77111,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77128,7 +77128,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77144,7 +77144,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77159,7 +77159,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77175,7 +77175,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77193,7 +77193,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77210,7 +77210,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77226,7 +77226,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77244,7 +77244,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77261,7 +77261,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77277,7 +77277,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77295,7 +77295,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77313,7 +77313,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77331,7 +77331,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77349,7 +77349,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77368,7 +77368,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77386,7 +77386,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77404,7 +77404,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77422,7 +77422,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77441,7 +77441,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77462,7 +77462,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77479,7 +77479,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__AVX_LD128, qmax) {
@@ -77493,7 +77493,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__AVX_LD128, strided_cm) {
@@ -77507,7 +77507,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -77523,7 +77523,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__XOP_LD128, strided_cn) {
@@ -77537,7 +77537,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__XOP_LD128, k_eq_8_strided_a) {
@@ -77551,7 +77551,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__XOP_LD128, k_eq_8_subtile) {
@@ -77567,7 +77567,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77584,7 +77584,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77600,7 +77600,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77615,7 +77615,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77631,7 +77631,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77649,7 +77649,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77666,7 +77666,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77682,7 +77682,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77700,7 +77700,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77717,7 +77717,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77733,7 +77733,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -77751,7 +77751,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77769,7 +77769,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77787,7 +77787,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77805,7 +77805,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77824,7 +77824,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77842,7 +77842,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77860,7 +77860,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77878,7 +77878,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -77897,7 +77897,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77918,7 +77918,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -77935,7 +77935,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__XOP_LD128, qmax) {
@@ -77949,7 +77949,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X4C8__XOP_LD128, strided_cm) {
@@ -77963,7 +77963,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -77979,7 +77979,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__XOP_LD128, strided_cn) {
@@ -77993,7 +77993,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__XOP_LD128, k_eq_8_strided_a) {
@@ -78007,7 +78007,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__XOP_LD128, k_eq_8_subtile) {
@@ -78023,7 +78023,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78040,7 +78040,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78056,7 +78056,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78071,7 +78071,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78087,7 +78087,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78105,7 +78105,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78122,7 +78122,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78138,7 +78138,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78156,7 +78156,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78173,7 +78173,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78189,7 +78189,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78207,7 +78207,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78225,7 +78225,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78243,7 +78243,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78261,7 +78261,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78280,7 +78280,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78298,7 +78298,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78316,7 +78316,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78334,7 +78334,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78353,7 +78353,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78374,7 +78374,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78391,7 +78391,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__XOP_LD128, qmax) {
@@ -78405,7 +78405,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X4C8__XOP_LD128, strided_cm) {
@@ -78419,7 +78419,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -78435,7 +78435,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__XOP_LD128, strided_cn) {
@@ -78449,7 +78449,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__XOP_LD128, k_eq_8_strided_a) {
@@ -78463,7 +78463,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__XOP_LD128, k_eq_8_subtile) {
@@ -78479,7 +78479,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78496,7 +78496,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78512,7 +78512,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78527,7 +78527,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78543,7 +78543,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78561,7 +78561,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78578,7 +78578,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78594,7 +78594,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78612,7 +78612,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78629,7 +78629,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78645,7 +78645,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -78663,7 +78663,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78681,7 +78681,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78699,7 +78699,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78717,7 +78717,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78736,7 +78736,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78754,7 +78754,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78772,7 +78772,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78790,7 +78790,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -78809,7 +78809,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78830,7 +78830,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -78847,7 +78847,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__XOP_LD128, qmax) {
@@ -78861,7 +78861,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X4C8__XOP_LD128, strided_cm) {
@@ -78875,7 +78875,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -81610,7 +81610,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C8__SSE41, strided_cn) {
@@ -81625,7 +81625,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C8__SSE41, k_eq_8_strided_a) {
@@ -81640,7 +81640,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C8__SSE41, k_eq_8_subtile) {
@@ -81657,7 +81657,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -81675,7 +81675,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -81692,7 +81692,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -81708,7 +81708,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -81725,7 +81725,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -81744,7 +81744,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -81762,7 +81762,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -81779,7 +81779,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -81798,7 +81798,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -81816,7 +81816,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -81833,7 +81833,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -81852,7 +81852,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -81871,7 +81871,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -81890,7 +81890,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -81909,7 +81909,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -81929,7 +81929,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -81948,7 +81948,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -81967,7 +81967,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -81986,7 +81986,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82006,7 +82006,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82028,7 +82028,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82046,7 +82046,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -82063,7 +82063,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X4C8__SSE41, strided_cn) {
@@ -82078,7 +82078,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X4C8__SSE41, k_eq_8_strided_a) {
@@ -82093,7 +82093,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X4C8__SSE41, k_eq_8_subtile) {
@@ -82110,7 +82110,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82128,7 +82128,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82145,7 +82145,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82161,7 +82161,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82178,7 +82178,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82197,7 +82197,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82215,7 +82215,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82232,7 +82232,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82251,7 +82251,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82269,7 +82269,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82286,7 +82286,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82305,7 +82305,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82324,7 +82324,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82343,7 +82343,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82362,7 +82362,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82382,7 +82382,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82401,7 +82401,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82420,7 +82420,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82439,7 +82439,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82459,7 +82459,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82481,7 +82481,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82499,7 +82499,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -82516,7 +82516,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X4C8__SSE41, strided_cn) {
@@ -82531,7 +82531,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X4C8__SSE41, k_eq_8_strided_a) {
@@ -82546,7 +82546,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X4C8__SSE41, k_eq_8_subtile) {
@@ -82563,7 +82563,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82581,7 +82581,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82598,7 +82598,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82614,7 +82614,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82631,7 +82631,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82650,7 +82650,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82668,7 +82668,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82685,7 +82685,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82704,7 +82704,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82722,7 +82722,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82739,7 +82739,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -82758,7 +82758,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82777,7 +82777,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82796,7 +82796,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82815,7 +82815,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82835,7 +82835,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82854,7 +82854,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82873,7 +82873,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82892,7 +82892,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -82912,7 +82912,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82934,7 +82934,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -82952,7 +82952,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -82969,7 +82969,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C8__AVX, strided_cn) {
@@ -82984,7 +82984,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C8__AVX, k_eq_8_strided_a) {
@@ -82999,7 +82999,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C8__AVX, k_eq_8_subtile) {
@@ -83016,7 +83016,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83034,7 +83034,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83051,7 +83051,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83067,7 +83067,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83084,7 +83084,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83103,7 +83103,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83121,7 +83121,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83138,7 +83138,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83157,7 +83157,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83175,7 +83175,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83192,7 +83192,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83211,7 +83211,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83230,7 +83230,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83249,7 +83249,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83268,7 +83268,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83288,7 +83288,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83307,7 +83307,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83326,7 +83326,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83345,7 +83345,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83365,7 +83365,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83387,7 +83387,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83405,7 +83405,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -83422,7 +83422,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X4C8__AVX, strided_cn) {
@@ -83437,7 +83437,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X4C8__AVX, k_eq_8_strided_a) {
@@ -83452,7 +83452,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X4C8__AVX, k_eq_8_subtile) {
@@ -83469,7 +83469,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83487,7 +83487,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83504,7 +83504,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83520,7 +83520,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83537,7 +83537,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83556,7 +83556,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83574,7 +83574,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83591,7 +83591,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83610,7 +83610,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83628,7 +83628,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83645,7 +83645,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83664,7 +83664,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83683,7 +83683,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83702,7 +83702,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83721,7 +83721,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83741,7 +83741,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83760,7 +83760,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83779,7 +83779,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83798,7 +83798,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83818,7 +83818,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83840,7 +83840,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -83858,7 +83858,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -83875,7 +83875,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X4C8__AVX, strided_cn) {
@@ -83890,7 +83890,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X4C8__AVX, k_eq_8_strided_a) {
@@ -83905,7 +83905,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X4C8__AVX, k_eq_8_subtile) {
@@ -83922,7 +83922,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -83940,7 +83940,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83957,7 +83957,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83973,7 +83973,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -83990,7 +83990,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84009,7 +84009,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84027,7 +84027,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84044,7 +84044,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84063,7 +84063,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84081,7 +84081,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84098,7 +84098,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84117,7 +84117,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84136,7 +84136,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84155,7 +84155,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84174,7 +84174,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84194,7 +84194,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84213,7 +84213,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84232,7 +84232,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84251,7 +84251,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84271,7 +84271,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84293,7 +84293,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84311,7 +84311,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -84328,7 +84328,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C8__XOP, strided_cn) {
@@ -84343,7 +84343,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C8__XOP, k_eq_8_strided_a) {
@@ -84358,7 +84358,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X4C8__XOP, k_eq_8_subtile) {
@@ -84375,7 +84375,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84393,7 +84393,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84410,7 +84410,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84426,7 +84426,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84443,7 +84443,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84462,7 +84462,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84480,7 +84480,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84497,7 +84497,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84516,7 +84516,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84534,7 +84534,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84551,7 +84551,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84570,7 +84570,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84589,7 +84589,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84608,7 +84608,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84627,7 +84627,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84647,7 +84647,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84666,7 +84666,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84685,7 +84685,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84704,7 +84704,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84724,7 +84724,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84746,7 +84746,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84764,7 +84764,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -84781,7 +84781,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X4C8__XOP, strided_cn) {
@@ -84796,7 +84796,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X4C8__XOP, k_eq_8_strided_a) {
@@ -84811,7 +84811,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X4C8__XOP, k_eq_8_subtile) {
@@ -84828,7 +84828,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -84846,7 +84846,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84863,7 +84863,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84879,7 +84879,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84896,7 +84896,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84915,7 +84915,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84933,7 +84933,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84950,7 +84950,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -84969,7 +84969,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -84987,7 +84987,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -85004,7 +85004,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -85023,7 +85023,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -85042,7 +85042,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85061,7 +85061,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85080,7 +85080,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85100,7 +85100,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -85119,7 +85119,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85138,7 +85138,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85157,7 +85157,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85177,7 +85177,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -85199,7 +85199,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -85217,7 +85217,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -85234,7 +85234,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X4C8__XOP, strided_cn) {
@@ -85249,7 +85249,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X4C8__XOP, k_eq_8_strided_a) {
@@ -85264,7 +85264,7 @@
       .n(4)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X4C8__XOP, k_eq_8_subtile) {
@@ -85281,7 +85281,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85299,7 +85299,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -85316,7 +85316,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -85332,7 +85332,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -85349,7 +85349,7 @@
         .n(4)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -85368,7 +85368,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -85386,7 +85386,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -85403,7 +85403,7 @@
         .n(4)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -85422,7 +85422,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -85440,7 +85440,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -85457,7 +85457,7 @@
         .n(4)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -85476,7 +85476,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -85495,7 +85495,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85514,7 +85514,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85533,7 +85533,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85553,7 +85553,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -85572,7 +85572,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85591,7 +85591,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85610,7 +85610,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -85630,7 +85630,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -85652,7 +85652,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -85670,7 +85670,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -85686,7 +85686,7 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X8C8__AVX2, strided_cn) {
@@ -85700,7 +85700,7 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X8C8__AVX2, k_eq_8_strided_a) {
@@ -85714,7 +85714,7 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X8C8__AVX2, k_eq_8_subtile) {
@@ -85730,7 +85730,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -85747,7 +85747,7 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -85763,7 +85763,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -85778,7 +85778,7 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -85794,7 +85794,7 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -85812,7 +85812,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -85829,7 +85829,7 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -85845,7 +85845,7 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -85863,7 +85863,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -85880,7 +85880,7 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -85896,7 +85896,7 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -85914,7 +85914,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -85932,7 +85932,7 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -85950,7 +85950,7 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -85968,7 +85968,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -85987,7 +85987,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86005,7 +86005,7 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86023,7 +86023,7 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86041,7 +86041,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86060,7 +86060,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86081,7 +86081,7 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86098,7 +86098,7 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X8C8__AVX2, qmax) {
@@ -86112,7 +86112,7 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X8C8__AVX2, strided_cm) {
@@ -86126,7 +86126,7 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -86142,7 +86142,7 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X8C8__AVX2, strided_cn) {
@@ -86156,7 +86156,7 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X8C8__AVX2, k_eq_8_strided_a) {
@@ -86170,7 +86170,7 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X8C8__AVX2, k_eq_8_subtile) {
@@ -86186,7 +86186,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86203,7 +86203,7 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86219,7 +86219,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86234,7 +86234,7 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86250,7 +86250,7 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86268,7 +86268,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86285,7 +86285,7 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86301,7 +86301,7 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86319,7 +86319,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86336,7 +86336,7 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86352,7 +86352,7 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86370,7 +86370,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86388,7 +86388,7 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86406,7 +86406,7 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86424,7 +86424,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86443,7 +86443,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86461,7 +86461,7 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86479,7 +86479,7 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86497,7 +86497,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86516,7 +86516,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86537,7 +86537,7 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86554,7 +86554,7 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X8C8__AVX2, qmax) {
@@ -86568,7 +86568,7 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X8C8__AVX2, strided_cm) {
@@ -86582,7 +86582,7 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -86598,7 +86598,7 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X8C8__AVX2, strided_cn) {
@@ -86612,7 +86612,7 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X8C8__AVX2, k_eq_8_strided_a) {
@@ -86626,7 +86626,7 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X8C8__AVX2, k_eq_8_subtile) {
@@ -86642,7 +86642,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86659,7 +86659,7 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86675,7 +86675,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86690,7 +86690,7 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86706,7 +86706,7 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86724,7 +86724,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86741,7 +86741,7 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86757,7 +86757,7 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86775,7 +86775,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86792,7 +86792,7 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86808,7 +86808,7 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -86826,7 +86826,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86844,7 +86844,7 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86862,7 +86862,7 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86880,7 +86880,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86899,7 +86899,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86917,7 +86917,7 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86935,7 +86935,7 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86953,7 +86953,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -86972,7 +86972,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -86993,7 +86993,7 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87010,7 +87010,7 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X8C8__AVX2, qmax) {
@@ -87024,7 +87024,7 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X8C8__AVX2, strided_cm) {
@@ -87038,7 +87038,7 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -87055,7 +87055,7 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X8C8__AVX2, strided_cn) {
@@ -87070,7 +87070,7 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X8C8__AVX2, k_eq_8_strided_a) {
@@ -87085,7 +87085,7 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_1X8C8__AVX2, k_eq_8_subtile) {
@@ -87102,7 +87102,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87120,7 +87120,7 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87137,7 +87137,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87153,7 +87153,7 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87170,7 +87170,7 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87189,7 +87189,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87207,7 +87207,7 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87224,7 +87224,7 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87243,7 +87243,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87261,7 +87261,7 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87278,7 +87278,7 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87297,7 +87297,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87316,7 +87316,7 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87335,7 +87335,7 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87354,7 +87354,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87374,7 +87374,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87393,7 +87393,7 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87412,7 +87412,7 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87431,7 +87431,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87451,7 +87451,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87473,7 +87473,7 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87491,7 +87491,7 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -87508,7 +87508,7 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X8C8__AVX2, strided_cn) {
@@ -87523,7 +87523,7 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X8C8__AVX2, k_eq_8_strided_a) {
@@ -87538,7 +87538,7 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_2X8C8__AVX2, k_eq_8_subtile) {
@@ -87555,7 +87555,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87573,7 +87573,7 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87590,7 +87590,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87606,7 +87606,7 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87623,7 +87623,7 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87642,7 +87642,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87660,7 +87660,7 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87677,7 +87677,7 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87696,7 +87696,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87714,7 +87714,7 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87731,7 +87731,7 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -87750,7 +87750,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87769,7 +87769,7 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87788,7 +87788,7 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87807,7 +87807,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87827,7 +87827,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87846,7 +87846,7 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87865,7 +87865,7 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87884,7 +87884,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -87904,7 +87904,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87926,7 +87926,7 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -87944,7 +87944,7 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -87961,7 +87961,7 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X8C8__AVX2, strided_cn) {
@@ -87976,7 +87976,7 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X8C8__AVX2, k_eq_8_strided_a) {
@@ -87991,7 +87991,7 @@
       .n(8)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_GEMM_XW_MINMAX_3X8C8__AVX2, k_eq_8_subtile) {
@@ -88008,7 +88008,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -88026,7 +88026,7 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -88043,7 +88043,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -88059,7 +88059,7 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -88076,7 +88076,7 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -88095,7 +88095,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -88113,7 +88113,7 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -88130,7 +88130,7 @@
         .n(8)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -88149,7 +88149,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -88167,7 +88167,7 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -88184,7 +88184,7 @@
         .n(8)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -88203,7 +88203,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -88222,7 +88222,7 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -88241,7 +88241,7 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -88260,7 +88260,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -88280,7 +88280,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -88299,7 +88299,7 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -88318,7 +88318,7 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -88337,7 +88337,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -88357,7 +88357,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -88379,7 +88379,7 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -88397,7 +88397,7 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -88413,7 +88413,7 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X16C8__AVX512SKX, strided_cn) {
@@ -88427,7 +88427,7 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X16C8__AVX512SKX, k_eq_8_strided_a) {
@@ -88441,7 +88441,7 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X16C8__AVX512SKX, k_eq_8_subtile) {
@@ -88457,7 +88457,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -88474,7 +88474,7 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88490,7 +88490,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88505,7 +88505,7 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88521,7 +88521,7 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88539,7 +88539,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -88556,7 +88556,7 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88572,7 +88572,7 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88590,7 +88590,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -88607,7 +88607,7 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88623,7 +88623,7 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88641,7 +88641,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -88659,7 +88659,7 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -88677,7 +88677,7 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -88695,7 +88695,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -88714,7 +88714,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -88732,7 +88732,7 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -88750,7 +88750,7 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -88768,7 +88768,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -88787,7 +88787,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -88808,7 +88808,7 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -88825,7 +88825,7 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X16C8__AVX512SKX, qmax) {
@@ -88839,7 +88839,7 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_1X16C8__AVX512SKX, strided_cm) {
@@ -88853,7 +88853,7 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -88869,7 +88869,7 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X16C8__AVX512SKX, strided_cn) {
@@ -88883,7 +88883,7 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X16C8__AVX512SKX, k_eq_8_strided_a) {
@@ -88897,7 +88897,7 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X16C8__AVX512SKX, k_eq_8_subtile) {
@@ -88913,7 +88913,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -88930,7 +88930,7 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88946,7 +88946,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88961,7 +88961,7 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88977,7 +88977,7 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -88995,7 +88995,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89012,7 +89012,7 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89028,7 +89028,7 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89046,7 +89046,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89063,7 +89063,7 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89079,7 +89079,7 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89097,7 +89097,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89115,7 +89115,7 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89133,7 +89133,7 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89151,7 +89151,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89170,7 +89170,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89188,7 +89188,7 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89206,7 +89206,7 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89224,7 +89224,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89243,7 +89243,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89264,7 +89264,7 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89281,7 +89281,7 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X16C8__AVX512SKX, qmax) {
@@ -89295,7 +89295,7 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_2X16C8__AVX512SKX, strided_cm) {
@@ -89309,7 +89309,7 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -89325,7 +89325,7 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X16C8__AVX512SKX, strided_cn) {
@@ -89339,7 +89339,7 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X16C8__AVX512SKX, k_eq_8_strided_a) {
@@ -89353,7 +89353,7 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X16C8__AVX512SKX, k_eq_8_subtile) {
@@ -89369,7 +89369,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89386,7 +89386,7 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89402,7 +89402,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89417,7 +89417,7 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89433,7 +89433,7 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89451,7 +89451,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89468,7 +89468,7 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89484,7 +89484,7 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89502,7 +89502,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89519,7 +89519,7 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89535,7 +89535,7 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89553,7 +89553,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89571,7 +89571,7 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89589,7 +89589,7 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89607,7 +89607,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89626,7 +89626,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89644,7 +89644,7 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89662,7 +89662,7 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89680,7 +89680,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89699,7 +89699,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89720,7 +89720,7 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89737,7 +89737,7 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X16C8__AVX512SKX, qmax) {
@@ -89751,7 +89751,7 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_3X16C8__AVX512SKX, strided_cm) {
@@ -89765,7 +89765,7 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -89781,7 +89781,7 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X16C8__AVX512SKX, strided_cn) {
@@ -89795,7 +89795,7 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X16C8__AVX512SKX, k_eq_8_strided_a) {
@@ -89809,7 +89809,7 @@
       .n(16)
       .k(8)
       .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X16C8__AVX512SKX, k_eq_8_subtile) {
@@ -89825,7 +89825,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -89842,7 +89842,7 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89858,7 +89858,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89873,7 +89873,7 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89889,7 +89889,7 @@
         .n(16)
         .k(k)
         .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89907,7 +89907,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89924,7 +89924,7 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89940,7 +89940,7 @@
         .n(16)
         .k(k)
         .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89958,7 +89958,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -89975,7 +89975,7 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -89991,7 +89991,7 @@
         .n(16)
         .k(k)
         .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -90009,7 +90009,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -90027,7 +90027,7 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -90045,7 +90045,7 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -90063,7 +90063,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -90082,7 +90082,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -90100,7 +90100,7 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -90118,7 +90118,7 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -90136,7 +90136,7 @@
           .n(n)
           .k(k)
           .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -90155,7 +90155,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -90176,7 +90176,7 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -90193,7 +90193,7 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X16C8__AVX512SKX, qmax) {
@@ -90207,7 +90207,7 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_GEMM_MINMAX_4X16C8__AVX512SKX, strided_cm) {
@@ -90221,7 +90221,7 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
diff --git a/test/qs8-gemm-minmax.yaml b/test/qs8-gemm-minmax.yaml
index 41a669e..53fd93d 100644
--- a/test/qs8-gemm-minmax.yaml
+++ b/test/qs8-gemm-minmax.yaml
@@ -307,40 +307,40 @@
   init: xnn_init_qs8_gemm_sse2_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld128
   init: xnn_init_qs8_gemm_sse2_params
@@ -367,40 +367,40 @@
   init: xnn_init_qs8_gemm_sse2_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c2__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c2__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c2__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c2__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c2__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_4x4c2__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c2__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c2__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2
   init: xnn_init_qs8_gemm_sse2_params
@@ -415,22 +415,22 @@
   init: xnn_init_qs8_gemm_sse2_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__avx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__avx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64
   init: xnn_init_qs8_gemm_sse2_params
@@ -451,31 +451,31 @@
   init: xnn_init_qs8_gemm_sse2_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128
   init: xnn_init_qs8_gemm_sse2_params
@@ -496,31 +496,31 @@
   init: xnn_init_qs8_gemm_sse2_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c8__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c8__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c8__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2
   init: xnn_init_qs8_gemm_sse2_params
@@ -541,61 +541,61 @@
   init: xnn_init_qs8_gemm_sse2_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__avx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__avx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__avx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
   k-block: 8
 - name: xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64
   init: xnn_init_qs8_gemm_wasmsimd_params
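
The spec entries above pair each QS8 GEMM microkernel with the params-init routine that the generated tests and benchmarks pass to it. The pattern in this change is visible from the kernel-name suffixes alone: kernels that already require SSE4.1 or newer (the __sse41, __avx, __xop, and __avx512skx variants) now reference xnn_init_qs8_gemm_sse4_params, the __avx2 kernels reference xnn_init_qs8_gemm_avx2_params, and only the __sse2 kernels keep xnn_init_qs8_gemm_sse2_params. A stand-alone checker encoding that convention could look like the hypothetical sketch below (it is not part of XNNPACK; the regex-based parsing and the expectation table are assumptions made for illustration, and the spec file path is taken from the command line rather than hard-coded):

// Hypothetical consistency check for a spec file like the one above:
// verifies that each x86 kernel entry references the params initializer
// implied by its ISA suffix, following the convention in this change.
#include <fstream>
#include <iostream>
#include <regex>
#include <string>

static std::string ExpectedInit(const std::string& kernel_name) {
  if (kernel_name.find("__avx2") != std::string::npos) {
    return "xnn_init_qs8_gemm_avx2_params";
  }
  if (kernel_name.find("__sse41") != std::string::npos ||
      kernel_name.find("__avx") != std::string::npos ||        // AVX and AVX512SKX
      kernel_name.find("__xop") != std::string::npos) {
    return "xnn_init_qs8_gemm_sse4_params";
  }
  if (kernel_name.find("__sse2") != std::string::npos) {
    return "xnn_init_qs8_gemm_sse2_params";
  }
  return "";  // non-x86 kernels (e.g. wasmsimd) are not checked here
}

int main(int argc, char** argv) {
  if (argc < 2) {
    std::cerr << "usage: " << argv[0] << " <spec.yaml>\n";
    return 2;
  }
  std::ifstream spec(argv[1]);
  const std::regex name_re(R"(-\s*name:\s*(\S+))");
  const std::regex init_re(R"(init:\s*(\S+))");
  std::string line, kernel;
  std::smatch m;
  int mismatches = 0;
  while (std::getline(spec, line)) {
    if (std::regex_search(line, m, name_re)) {
      kernel = m[1].str();
    } else if (std::regex_search(line, m, init_re)) {
      const std::string expected = ExpectedInit(kernel);
      const std::string found = m[1].str();
      if (!expected.empty() && found != expected) {
        std::cout << kernel << ": expected " << expected << ", found " << found << "\n";
        ++mismatches;
      }
    }
  }
  return mismatches == 0 ? 0 : 1;
}

Run against the updated spec, the checker should report no mismatches; run against the pre-change spec, every __sse41/__avx/__xop/__avx2 entry would be flagged.
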
diff --git a/test/qs8-igemm-minmax.cc b/test/qs8-igemm-minmax.cc
index b631389..fa33931 100644
--- a/test/qs8-igemm-minmax.cc
+++ b/test/qs8-igemm-minmax.cc
@@ -45429,7 +45429,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__SSE41_LD64, strided_cn) {
@@ -45443,7 +45443,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__SSE41_LD64, k_eq_8_subtile) {
@@ -45459,7 +45459,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -45476,7 +45476,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -45492,7 +45492,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -45507,7 +45507,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -45525,7 +45525,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -45542,7 +45542,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -45560,7 +45560,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -45577,7 +45577,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -45595,7 +45595,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -45613,7 +45613,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -45631,7 +45631,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -45650,7 +45650,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -45668,7 +45668,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -45686,7 +45686,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -45705,7 +45705,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -45723,7 +45723,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -45742,7 +45742,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -45761,7 +45761,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -45779,7 +45779,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -45799,7 +45799,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -45818,7 +45818,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -45837,7 +45837,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -45853,7 +45853,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__SSE41_LD64, qmax) {
@@ -45867,7 +45867,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__SSE41_LD64, strided_cm) {
@@ -45881,7 +45881,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
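
The test blocks in this file all follow the same fluent builder pattern: dimensions and strides are set with chained calls, and Test() receives both the microkernel and its params-init function, so each ISA variant can carry its own params layout without changing the test body. The sketch below is a simplified, hypothetical illustration of that pattern (it is not the real tester class used by these files; all type and function names are invented, and the quantization arguments are elided):

// Minimal sketch of the builder pattern used by the tests above.
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct Qs8GemmParams { uint8_t bytes[64]; };  // opaque blob; layout is set by the init fn

typedef void (*Qs8GemmUKernelFn)(size_t m, size_t n, size_t k, const Qs8GemmParams* params);
typedef void (*Qs8GemmInitFn)(Qs8GemmParams* params, int8_t output_min, int8_t output_max);

class TesterSketch {
 public:
  TesterSketch& m(size_t m) { m_ = m; return *this; }
  TesterSketch& n(size_t n) { n_ = n; return *this; }
  TesterSketch& k(size_t k) { k_ = k; return *this; }
  void Test(Qs8GemmUKernelFn ukernel, Qs8GemmInitFn init) const {
    Qs8GemmParams params;
    init(&params, -128, 127);      // fills the ISA-specific layout
    ukernel(m_, n_, k_, &params);  // kernel under test consumes that layout
    // ...a real tester would compare the output against a scalar reference...
  }
 private:
  size_t m_ = 1, n_ = 1, k_ = 1;
};

// Dummy stand-ins so the sketch runs end to end.
static void DummyInit(Qs8GemmParams*, int8_t, int8_t) {}
static void DummyUKernel(size_t m, size_t n, size_t k, const Qs8GemmParams*) {
  std::printf("ran dummy ukernel: m=%zu n=%zu k=%zu\n", m, n, k);
}

int main() {
  TesterSketch().m(1).n(4).k(8).Test(DummyUKernel, DummyInit);
  return 0;
}

Because the init function is injected alongside the kernel pointer, the mechanical substitution in this diff (sse2_params to sse4_params or avx2_params) leaves every test body untouched; only the params blob handed to the kernel changes.
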
 
@@ -45897,7 +45897,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__SSE41_LD64, strided_cn) {
@@ -45911,7 +45911,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__SSE41_LD64, k_eq_8_subtile) {
@@ -45927,7 +45927,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -45944,7 +45944,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -45960,7 +45960,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -45975,7 +45975,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -45993,7 +45993,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46010,7 +46010,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46028,7 +46028,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46045,7 +46045,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46063,7 +46063,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46081,7 +46081,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46099,7 +46099,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46118,7 +46118,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46136,7 +46136,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46154,7 +46154,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46173,7 +46173,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46191,7 +46191,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46210,7 +46210,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46229,7 +46229,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46247,7 +46247,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46267,7 +46267,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46286,7 +46286,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46305,7 +46305,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46321,7 +46321,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__SSE41_LD64, qmax) {
@@ -46335,7 +46335,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__SSE41_LD64, strided_cm) {
@@ -46349,7 +46349,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -46365,7 +46365,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__SSE41_LD64, strided_cn) {
@@ -46379,7 +46379,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__SSE41_LD64, k_eq_8_subtile) {
@@ -46395,7 +46395,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46412,7 +46412,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46428,7 +46428,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46443,7 +46443,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46461,7 +46461,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46478,7 +46478,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46496,7 +46496,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46513,7 +46513,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46531,7 +46531,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46549,7 +46549,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46567,7 +46567,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46586,7 +46586,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46604,7 +46604,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46622,7 +46622,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46641,7 +46641,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46659,7 +46659,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46678,7 +46678,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46697,7 +46697,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46715,7 +46715,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46735,7 +46735,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46754,7 +46754,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46773,7 +46773,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46789,7 +46789,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__SSE41_LD64, qmax) {
@@ -46803,7 +46803,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__SSE41_LD64, strided_cm) {
@@ -46817,7 +46817,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -46833,7 +46833,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__SSE41_LD64, strided_cn) {
@@ -46847,7 +46847,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__SSE41_LD64, k_eq_8_subtile) {
@@ -46863,7 +46863,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -46880,7 +46880,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46896,7 +46896,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46911,7 +46911,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46929,7 +46929,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46946,7 +46946,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46964,7 +46964,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -46981,7 +46981,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -46999,7 +46999,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47017,7 +47017,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47035,7 +47035,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47054,7 +47054,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47072,7 +47072,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47090,7 +47090,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47109,7 +47109,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47127,7 +47127,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47146,7 +47146,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47165,7 +47165,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47183,7 +47183,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47203,7 +47203,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47222,7 +47222,7 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47241,7 +47241,7 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47257,7 +47257,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__SSE41_LD64, qmax) {
@@ -47271,7 +47271,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__SSE41_LD64, strided_cm) {
@@ -47285,7 +47285,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -47301,7 +47301,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__XOP_LD64, strided_cn) {
@@ -47315,7 +47315,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__XOP_LD64, k_eq_8_subtile) {
@@ -47331,7 +47331,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47348,7 +47348,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47364,7 +47364,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47379,7 +47379,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47397,7 +47397,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47414,7 +47414,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47432,7 +47432,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47449,7 +47449,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47467,7 +47467,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47485,7 +47485,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47503,7 +47503,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47522,7 +47522,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47540,7 +47540,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47558,7 +47558,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47577,7 +47577,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47595,7 +47595,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47614,7 +47614,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47633,7 +47633,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47651,7 +47651,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47671,7 +47671,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47690,7 +47690,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47709,7 +47709,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47725,7 +47725,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__XOP_LD64, qmax) {
@@ -47739,7 +47739,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__XOP_LD64, strided_cm) {
@@ -47753,7 +47753,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -47769,7 +47769,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__XOP_LD64, strided_cn) {
@@ -47783,7 +47783,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__XOP_LD64, k_eq_8_subtile) {
@@ -47799,7 +47799,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47816,7 +47816,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47832,7 +47832,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47847,7 +47847,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47865,7 +47865,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47882,7 +47882,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47900,7 +47900,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47917,7 +47917,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -47935,7 +47935,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -47953,7 +47953,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47971,7 +47971,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -47990,7 +47990,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48008,7 +48008,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48026,7 +48026,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48045,7 +48045,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48063,7 +48063,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48082,7 +48082,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48101,7 +48101,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48119,7 +48119,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48139,7 +48139,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48158,7 +48158,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48177,7 +48177,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48193,7 +48193,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__XOP_LD64, qmax) {
@@ -48207,7 +48207,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__XOP_LD64, strided_cm) {
@@ -48221,7 +48221,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -48237,7 +48237,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__XOP_LD64, strided_cn) {
@@ -48251,7 +48251,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__XOP_LD64, k_eq_8_subtile) {
@@ -48267,7 +48267,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48284,7 +48284,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48300,7 +48300,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48315,7 +48315,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48333,7 +48333,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48350,7 +48350,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48368,7 +48368,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48385,7 +48385,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48403,7 +48403,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48421,7 +48421,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48439,7 +48439,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48458,7 +48458,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48476,7 +48476,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48494,7 +48494,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48513,7 +48513,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48531,7 +48531,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48550,7 +48550,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48569,7 +48569,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48587,7 +48587,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48607,7 +48607,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48626,7 +48626,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48645,7 +48645,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48661,7 +48661,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__XOP_LD64, qmax) {
@@ -48675,7 +48675,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__XOP_LD64, strided_cm) {
@@ -48689,7 +48689,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -48705,7 +48705,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__XOP_LD64, strided_cn) {
@@ -48719,7 +48719,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__XOP_LD64, k_eq_8_subtile) {
@@ -48735,7 +48735,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48752,7 +48752,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48768,7 +48768,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48783,7 +48783,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48801,7 +48801,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48818,7 +48818,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48836,7 +48836,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48853,7 +48853,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -48871,7 +48871,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48889,7 +48889,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48907,7 +48907,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48926,7 +48926,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48944,7 +48944,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48962,7 +48962,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -48981,7 +48981,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -48999,7 +48999,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49018,7 +49018,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49037,7 +49037,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49055,7 +49055,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49075,7 +49075,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -49094,7 +49094,7 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -49113,7 +49113,7 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -49129,7 +49129,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__XOP_LD64, qmax) {
@@ -49143,7 +49143,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__XOP_LD64, strided_cm) {
@@ -49157,7 +49157,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -52917,7 +52917,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__SSE41_LD128, strided_cn) {
@@ -52931,7 +52931,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__SSE41_LD128, k_eq_8_subtile) {
@@ -52947,7 +52947,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -52964,7 +52964,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -52980,7 +52980,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -52995,7 +52995,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53013,7 +53013,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53030,7 +53030,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53048,7 +53048,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53065,7 +53065,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53083,7 +53083,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53101,7 +53101,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53119,7 +53119,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53138,7 +53138,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53156,7 +53156,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53174,7 +53174,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53193,7 +53193,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53211,7 +53211,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53230,7 +53230,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53249,7 +53249,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53267,7 +53267,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53287,7 +53287,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53306,7 +53306,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53325,7 +53325,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53341,7 +53341,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__SSE41_LD128, qmax) {
@@ -53355,7 +53355,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__SSE41_LD128, strided_cm) {
@@ -53369,7 +53369,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -53385,7 +53385,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__SSE41_LD128, strided_cn) {
@@ -53399,7 +53399,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__SSE41_LD128, k_eq_8_subtile) {
@@ -53415,7 +53415,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53432,7 +53432,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53448,7 +53448,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53463,7 +53463,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53481,7 +53481,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53498,7 +53498,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53516,7 +53516,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53533,7 +53533,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53551,7 +53551,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53569,7 +53569,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53587,7 +53587,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53606,7 +53606,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53624,7 +53624,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53642,7 +53642,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53661,7 +53661,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53679,7 +53679,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53698,7 +53698,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53717,7 +53717,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53735,7 +53735,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53755,7 +53755,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53774,7 +53774,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53793,7 +53793,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53809,7 +53809,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__SSE41_LD128, qmax) {
@@ -53823,7 +53823,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__SSE41_LD128, strided_cm) {
@@ -53837,7 +53837,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -53853,7 +53853,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__SSE41_LD128, strided_cn) {
@@ -53867,7 +53867,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__SSE41_LD128, k_eq_8_subtile) {
@@ -53883,7 +53883,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -53900,7 +53900,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53916,7 +53916,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53931,7 +53931,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53949,7 +53949,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -53966,7 +53966,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -53984,7 +53984,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54001,7 +54001,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54019,7 +54019,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54037,7 +54037,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54055,7 +54055,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54074,7 +54074,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54092,7 +54092,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54110,7 +54110,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54129,7 +54129,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54147,7 +54147,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54166,7 +54166,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54185,7 +54185,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54203,7 +54203,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54223,7 +54223,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54242,7 +54242,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54261,7 +54261,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54277,7 +54277,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__SSE41_LD128, qmax) {
@@ -54291,7 +54291,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__SSE41_LD128, strided_cm) {
@@ -54305,7 +54305,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -54321,7 +54321,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__SSE41_LD128, strided_cn) {
@@ -54335,7 +54335,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__SSE41_LD128, k_eq_8_subtile) {
@@ -54351,7 +54351,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54368,7 +54368,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54384,7 +54384,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54399,7 +54399,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54417,7 +54417,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54434,7 +54434,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54452,7 +54452,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54469,7 +54469,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54487,7 +54487,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54505,7 +54505,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54523,7 +54523,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54542,7 +54542,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54560,7 +54560,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54578,7 +54578,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54597,7 +54597,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54615,7 +54615,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54634,7 +54634,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54653,7 +54653,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54671,7 +54671,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54691,7 +54691,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54710,7 +54710,7 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54729,7 +54729,7 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54745,7 +54745,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__SSE41_LD128, qmax) {
@@ -54759,7 +54759,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__SSE41_LD128, strided_cm) {
@@ -54773,7 +54773,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -54789,7 +54789,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__AVX_LD128, strided_cn) {
@@ -54803,7 +54803,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__AVX_LD128, k_eq_8_subtile) {
@@ -54819,7 +54819,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54836,7 +54836,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54852,7 +54852,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54867,7 +54867,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54885,7 +54885,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54902,7 +54902,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54920,7 +54920,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54937,7 +54937,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -54955,7 +54955,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -54973,7 +54973,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -54991,7 +54991,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55010,7 +55010,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55028,7 +55028,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55046,7 +55046,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55065,7 +55065,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55083,7 +55083,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55102,7 +55102,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55121,7 +55121,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55139,7 +55139,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55159,7 +55159,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55178,7 +55178,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55197,7 +55197,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55213,7 +55213,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__AVX_LD128, qmax) {
@@ -55227,7 +55227,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__AVX_LD128, strided_cm) {
@@ -55241,7 +55241,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -55257,7 +55257,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__AVX_LD128, strided_cn) {
@@ -55271,7 +55271,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__AVX_LD128, k_eq_8_subtile) {
@@ -55287,7 +55287,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55304,7 +55304,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55320,7 +55320,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55335,7 +55335,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55353,7 +55353,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55370,7 +55370,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55388,7 +55388,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55405,7 +55405,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55423,7 +55423,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55441,7 +55441,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55459,7 +55459,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55478,7 +55478,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55496,7 +55496,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55514,7 +55514,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55533,7 +55533,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55551,7 +55551,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55570,7 +55570,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55589,7 +55589,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55607,7 +55607,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55627,7 +55627,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55646,7 +55646,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55665,7 +55665,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55681,7 +55681,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__AVX_LD128, qmax) {
@@ -55695,7 +55695,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__AVX_LD128, strided_cm) {
@@ -55709,7 +55709,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -55725,7 +55725,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__AVX_LD128, strided_cn) {
@@ -55739,7 +55739,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__AVX_LD128, k_eq_8_subtile) {
@@ -55755,7 +55755,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55772,7 +55772,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55788,7 +55788,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55803,7 +55803,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55821,7 +55821,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55838,7 +55838,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55856,7 +55856,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55873,7 +55873,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -55891,7 +55891,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55909,7 +55909,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55927,7 +55927,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55946,7 +55946,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -55964,7 +55964,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -55982,7 +55982,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56001,7 +56001,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56019,7 +56019,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56038,7 +56038,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56057,7 +56057,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56075,7 +56075,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56095,7 +56095,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56114,7 +56114,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56133,7 +56133,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56149,7 +56149,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__AVX_LD128, qmax) {
@@ -56163,7 +56163,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__AVX_LD128, strided_cm) {
@@ -56177,7 +56177,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -56193,7 +56193,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__AVX_LD128, strided_cn) {
@@ -56207,7 +56207,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__AVX_LD128, k_eq_8_subtile) {
@@ -56223,7 +56223,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56240,7 +56240,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56256,7 +56256,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56271,7 +56271,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56289,7 +56289,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56306,7 +56306,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56324,7 +56324,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56341,7 +56341,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56359,7 +56359,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56377,7 +56377,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56395,7 +56395,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56414,7 +56414,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56432,7 +56432,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56450,7 +56450,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56469,7 +56469,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56487,7 +56487,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56506,7 +56506,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56525,7 +56525,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56543,7 +56543,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56563,7 +56563,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56582,7 +56582,7 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56601,7 +56601,7 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56617,7 +56617,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__AVX_LD128, qmax) {
@@ -56631,7 +56631,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__AVX_LD128, strided_cm) {
@@ -56645,7 +56645,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -56661,7 +56661,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__XOP_LD128, strided_cn) {
@@ -56675,7 +56675,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__XOP_LD128, k_eq_8_subtile) {
@@ -56691,7 +56691,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56708,7 +56708,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56724,7 +56724,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56739,7 +56739,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56757,7 +56757,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56774,7 +56774,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56792,7 +56792,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56809,7 +56809,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56827,7 +56827,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56845,7 +56845,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56863,7 +56863,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56882,7 +56882,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56900,7 +56900,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56918,7 +56918,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -56937,7 +56937,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56955,7 +56955,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -56974,7 +56974,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -56993,7 +56993,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57011,7 +57011,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57031,7 +57031,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57050,7 +57050,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57069,7 +57069,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57085,7 +57085,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__XOP_LD128, qmax) {
@@ -57099,7 +57099,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C2__XOP_LD128, strided_cm) {
@@ -57113,7 +57113,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -57129,7 +57129,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__XOP_LD128, strided_cn) {
@@ -57143,7 +57143,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__XOP_LD128, k_eq_8_subtile) {
@@ -57159,7 +57159,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57176,7 +57176,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57192,7 +57192,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57207,7 +57207,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57225,7 +57225,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57242,7 +57242,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57260,7 +57260,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57277,7 +57277,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57295,7 +57295,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57313,7 +57313,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57331,7 +57331,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57350,7 +57350,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57368,7 +57368,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57386,7 +57386,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57405,7 +57405,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57423,7 +57423,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57442,7 +57442,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57461,7 +57461,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57479,7 +57479,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57499,7 +57499,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57518,7 +57518,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57537,7 +57537,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57553,7 +57553,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__XOP_LD128, qmax) {
@@ -57567,7 +57567,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C2__XOP_LD128, strided_cm) {
@@ -57581,7 +57581,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -57597,7 +57597,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__XOP_LD128, strided_cn) {
@@ -57611,7 +57611,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__XOP_LD128, k_eq_8_subtile) {
@@ -57627,7 +57627,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57644,7 +57644,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57660,7 +57660,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57675,7 +57675,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57693,7 +57693,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57710,7 +57710,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57728,7 +57728,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57745,7 +57745,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57763,7 +57763,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57781,7 +57781,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57799,7 +57799,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57818,7 +57818,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57836,7 +57836,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57854,7 +57854,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57873,7 +57873,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57891,7 +57891,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -57910,7 +57910,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57929,7 +57929,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57947,7 +57947,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -57967,7 +57967,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -57986,7 +57986,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58005,7 +58005,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58021,7 +58021,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__XOP_LD128, qmax) {
@@ -58035,7 +58035,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C2__XOP_LD128, strided_cm) {
@@ -58049,7 +58049,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -58065,7 +58065,7 @@
       .m(4)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__XOP_LD128, strided_cn) {
@@ -58079,7 +58079,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__XOP_LD128, k_eq_8_subtile) {
@@ -58095,7 +58095,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58112,7 +58112,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58128,7 +58128,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58143,7 +58143,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58161,7 +58161,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58178,7 +58178,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58196,7 +58196,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58213,7 +58213,7 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58231,7 +58231,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58249,7 +58249,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58267,7 +58267,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58286,7 +58286,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58304,7 +58304,7 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58322,7 +58322,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58341,7 +58341,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58359,7 +58359,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58378,7 +58378,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58397,7 +58397,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58415,7 +58415,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58435,7 +58435,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -58454,7 +58454,7 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -58473,7 +58473,7 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -58489,7 +58489,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__XOP_LD128, qmax) {
@@ -58503,7 +58503,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X4C2__XOP_LD128, strided_cm) {
@@ -58517,7 +58517,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -61341,7 +61341,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__SSE41_LD64, strided_cn) {
@@ -61355,7 +61355,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__SSE41_LD64, k_eq_8_subtile) {
@@ -61371,7 +61371,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -61388,7 +61388,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61404,7 +61404,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61419,7 +61419,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61437,7 +61437,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -61454,7 +61454,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61472,7 +61472,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -61489,7 +61489,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61507,7 +61507,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -61525,7 +61525,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -61543,7 +61543,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -61562,7 +61562,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -61580,7 +61580,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -61598,7 +61598,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -61617,7 +61617,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -61635,7 +61635,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61654,7 +61654,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -61673,7 +61673,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -61691,7 +61691,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -61711,7 +61711,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -61730,7 +61730,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61749,7 +61749,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -61765,7 +61765,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__SSE41_LD64, qmax) {
@@ -61779,7 +61779,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__SSE41_LD64, strided_cm) {
@@ -61793,7 +61793,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -61809,7 +61809,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__SSE41_LD64, strided_cn) {
@@ -61823,7 +61823,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__SSE41_LD64, k_eq_8_subtile) {
@@ -61839,7 +61839,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -61856,7 +61856,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61872,7 +61872,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61887,7 +61887,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61905,7 +61905,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -61922,7 +61922,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61940,7 +61940,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -61957,7 +61957,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -61975,7 +61975,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -61993,7 +61993,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62011,7 +62011,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62030,7 +62030,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62048,7 +62048,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62066,7 +62066,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62085,7 +62085,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62103,7 +62103,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62122,7 +62122,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62141,7 +62141,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62159,7 +62159,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62179,7 +62179,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62198,7 +62198,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62217,7 +62217,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62233,7 +62233,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__SSE41_LD64, qmax) {
@@ -62247,7 +62247,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__SSE41_LD64, strided_cm) {
@@ -62261,7 +62261,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -62277,7 +62277,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__SSE41_LD64, strided_cn) {
@@ -62291,7 +62291,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__SSE41_LD64, k_eq_8_subtile) {
@@ -62307,7 +62307,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62324,7 +62324,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62340,7 +62340,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62355,7 +62355,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62373,7 +62373,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62390,7 +62390,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62408,7 +62408,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62425,7 +62425,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62443,7 +62443,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62461,7 +62461,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62479,7 +62479,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62498,7 +62498,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62516,7 +62516,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62534,7 +62534,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62553,7 +62553,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62571,7 +62571,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62590,7 +62590,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62609,7 +62609,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62627,7 +62627,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62647,7 +62647,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62666,7 +62666,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62685,7 +62685,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62701,7 +62701,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__SSE41_LD64, qmax) {
@@ -62715,7 +62715,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__SSE41_LD64, strided_cm) {
@@ -62729,7 +62729,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -62745,7 +62745,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__AVX_LD64, strided_cn) {
@@ -62759,7 +62759,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__AVX_LD64, k_eq_8_subtile) {
@@ -62775,7 +62775,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62792,7 +62792,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62808,7 +62808,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62823,7 +62823,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62841,7 +62841,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62858,7 +62858,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62876,7 +62876,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62893,7 +62893,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -62911,7 +62911,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62929,7 +62929,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62947,7 +62947,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -62966,7 +62966,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -62984,7 +62984,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63002,7 +63002,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63021,7 +63021,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63039,7 +63039,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63058,7 +63058,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63077,7 +63077,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63095,7 +63095,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63115,7 +63115,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63134,7 +63134,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63153,7 +63153,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63169,7 +63169,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__AVX_LD64, qmax) {
@@ -63183,7 +63183,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__AVX_LD64, strided_cm) {
@@ -63197,7 +63197,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -63213,7 +63213,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__AVX_LD64, strided_cn) {
@@ -63227,7 +63227,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__AVX_LD64, k_eq_8_subtile) {
@@ -63243,7 +63243,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63260,7 +63260,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63276,7 +63276,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63291,7 +63291,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63309,7 +63309,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63326,7 +63326,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63344,7 +63344,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63361,7 +63361,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63379,7 +63379,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63397,7 +63397,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63415,7 +63415,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63434,7 +63434,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63452,7 +63452,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63470,7 +63470,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63489,7 +63489,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63507,7 +63507,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63526,7 +63526,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63545,7 +63545,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63563,7 +63563,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63583,7 +63583,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63602,7 +63602,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63621,7 +63621,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63637,7 +63637,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__AVX_LD64, qmax) {
@@ -63651,7 +63651,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__AVX_LD64, strided_cm) {
@@ -63665,7 +63665,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -63681,7 +63681,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__AVX_LD64, strided_cn) {
@@ -63695,7 +63695,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__AVX_LD64, k_eq_8_subtile) {
@@ -63711,7 +63711,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63728,7 +63728,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63744,7 +63744,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63759,7 +63759,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63777,7 +63777,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63794,7 +63794,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63812,7 +63812,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63829,7 +63829,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63847,7 +63847,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63865,7 +63865,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63883,7 +63883,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63902,7 +63902,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63920,7 +63920,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63938,7 +63938,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -63957,7 +63957,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -63975,7 +63975,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -63994,7 +63994,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64013,7 +64013,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64031,7 +64031,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64051,7 +64051,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64070,7 +64070,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64089,7 +64089,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64105,7 +64105,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__AVX_LD64, qmax) {
@@ -64119,7 +64119,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__AVX_LD64, strided_cm) {
@@ -64133,7 +64133,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -64149,7 +64149,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__XOP_LD64, strided_cn) {
@@ -64163,7 +64163,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__XOP_LD64, k_eq_8_subtile) {
@@ -64179,7 +64179,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64196,7 +64196,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64212,7 +64212,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64227,7 +64227,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64245,7 +64245,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64262,7 +64262,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64280,7 +64280,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64297,7 +64297,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64315,7 +64315,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64333,7 +64333,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64351,7 +64351,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64370,7 +64370,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64388,7 +64388,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64406,7 +64406,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64425,7 +64425,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64443,7 +64443,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64462,7 +64462,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64481,7 +64481,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64499,7 +64499,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64519,7 +64519,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64538,7 +64538,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64557,7 +64557,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64573,7 +64573,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__XOP_LD64, qmax) {
@@ -64587,7 +64587,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__XOP_LD64, strided_cm) {
@@ -64601,7 +64601,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -64617,7 +64617,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__XOP_LD64, strided_cn) {
@@ -64631,7 +64631,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__XOP_LD64, k_eq_8_subtile) {
@@ -64647,7 +64647,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64664,7 +64664,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64680,7 +64680,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64695,7 +64695,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64713,7 +64713,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64730,7 +64730,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64748,7 +64748,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64765,7 +64765,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64783,7 +64783,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64801,7 +64801,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64819,7 +64819,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64838,7 +64838,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64856,7 +64856,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64874,7 +64874,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64893,7 +64893,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64911,7 +64911,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -64930,7 +64930,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -64949,7 +64949,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64967,7 +64967,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -64987,7 +64987,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65006,7 +65006,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -65025,7 +65025,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65041,7 +65041,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__XOP_LD64, qmax) {
@@ -65055,7 +65055,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__XOP_LD64, strided_cm) {
@@ -65069,7 +65069,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -65085,7 +65085,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__XOP_LD64, strided_cn) {
@@ -65099,7 +65099,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__XOP_LD64, k_eq_8_subtile) {
@@ -65115,7 +65115,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65132,7 +65132,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -65148,7 +65148,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -65163,7 +65163,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -65181,7 +65181,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65198,7 +65198,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -65216,7 +65216,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65233,7 +65233,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -65251,7 +65251,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65269,7 +65269,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65287,7 +65287,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65306,7 +65306,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65324,7 +65324,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65342,7 +65342,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65361,7 +65361,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65379,7 +65379,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -65398,7 +65398,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65417,7 +65417,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65435,7 +65435,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65455,7 +65455,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -65474,7 +65474,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -65493,7 +65493,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -65509,7 +65509,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__XOP_LD64, qmax) {
@@ -65523,7 +65523,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__XOP_LD64, strided_cm) {
@@ -65537,7 +65537,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -68361,7 +68361,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__SSE41_LD128, strided_cn) {
@@ -68375,7 +68375,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__SSE41_LD128, k_eq_8_subtile) {
@@ -68391,7 +68391,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68408,7 +68408,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68424,7 +68424,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68439,7 +68439,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68457,7 +68457,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68474,7 +68474,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68492,7 +68492,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68509,7 +68509,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68527,7 +68527,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68545,7 +68545,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68563,7 +68563,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68582,7 +68582,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68600,7 +68600,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68618,7 +68618,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68637,7 +68637,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68655,7 +68655,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68674,7 +68674,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68693,7 +68693,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68711,7 +68711,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68731,7 +68731,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68750,7 +68750,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68769,7 +68769,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68785,7 +68785,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__SSE41_LD128, qmax) {
@@ -68799,7 +68799,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__SSE41_LD128, strided_cm) {
@@ -68813,7 +68813,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -68829,7 +68829,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__SSE41_LD128, strided_cn) {
@@ -68843,7 +68843,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__SSE41_LD128, k_eq_8_subtile) {
@@ -68859,7 +68859,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -68876,7 +68876,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68892,7 +68892,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68907,7 +68907,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68925,7 +68925,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68942,7 +68942,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68960,7 +68960,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -68977,7 +68977,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -68995,7 +68995,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69013,7 +69013,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69031,7 +69031,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69050,7 +69050,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69068,7 +69068,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69086,7 +69086,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69105,7 +69105,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69123,7 +69123,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69142,7 +69142,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69161,7 +69161,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69179,7 +69179,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69199,7 +69199,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69218,7 +69218,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69237,7 +69237,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69253,7 +69253,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__SSE41_LD128, qmax) {
@@ -69267,7 +69267,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__SSE41_LD128, strided_cm) {
@@ -69281,7 +69281,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -69297,7 +69297,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__SSE41_LD128, strided_cn) {
@@ -69311,7 +69311,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__SSE41_LD128, k_eq_8_subtile) {
@@ -69327,7 +69327,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69344,7 +69344,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69360,7 +69360,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69375,7 +69375,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69393,7 +69393,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69410,7 +69410,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69428,7 +69428,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69445,7 +69445,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69463,7 +69463,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69481,7 +69481,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69499,7 +69499,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69518,7 +69518,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69536,7 +69536,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69554,7 +69554,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69573,7 +69573,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69591,7 +69591,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69610,7 +69610,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69629,7 +69629,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69647,7 +69647,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69667,7 +69667,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69686,7 +69686,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69705,7 +69705,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69721,7 +69721,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__SSE41_LD128, qmax) {
@@ -69735,7 +69735,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__SSE41_LD128, strided_cm) {
@@ -69749,7 +69749,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -69765,7 +69765,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__AVX_LD128, strided_cn) {
@@ -69779,7 +69779,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__AVX_LD128, k_eq_8_subtile) {
@@ -69795,7 +69795,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69812,7 +69812,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69828,7 +69828,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69843,7 +69843,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69861,7 +69861,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69878,7 +69878,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69896,7 +69896,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69913,7 +69913,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -69931,7 +69931,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -69949,7 +69949,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69967,7 +69967,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -69986,7 +69986,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70004,7 +70004,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70022,7 +70022,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70041,7 +70041,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70059,7 +70059,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70078,7 +70078,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70097,7 +70097,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70115,7 +70115,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70135,7 +70135,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70154,7 +70154,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70173,7 +70173,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70189,7 +70189,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__AVX_LD128, qmax) {
@@ -70203,7 +70203,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__AVX_LD128, strided_cm) {
@@ -70217,7 +70217,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -70233,7 +70233,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__AVX_LD128, strided_cn) {
@@ -70247,7 +70247,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__AVX_LD128, k_eq_8_subtile) {
@@ -70263,7 +70263,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70280,7 +70280,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70296,7 +70296,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70311,7 +70311,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70329,7 +70329,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70346,7 +70346,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70364,7 +70364,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70381,7 +70381,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70399,7 +70399,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70417,7 +70417,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70435,7 +70435,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70454,7 +70454,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70472,7 +70472,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70490,7 +70490,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70509,7 +70509,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70527,7 +70527,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70546,7 +70546,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70565,7 +70565,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70583,7 +70583,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70603,7 +70603,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70622,7 +70622,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70641,7 +70641,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70657,7 +70657,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__AVX_LD128, qmax) {
@@ -70671,7 +70671,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__AVX_LD128, strided_cm) {
@@ -70685,7 +70685,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -70701,7 +70701,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__AVX_LD128, strided_cn) {
@@ -70715,7 +70715,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__AVX_LD128, k_eq_8_subtile) {
@@ -70731,7 +70731,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70748,7 +70748,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70764,7 +70764,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70779,7 +70779,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70797,7 +70797,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70814,7 +70814,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70832,7 +70832,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70849,7 +70849,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -70867,7 +70867,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70885,7 +70885,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70903,7 +70903,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70922,7 +70922,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70940,7 +70940,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70958,7 +70958,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -70977,7 +70977,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -70995,7 +70995,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71014,7 +71014,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71033,7 +71033,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71051,7 +71051,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71071,7 +71071,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71090,7 +71090,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71109,7 +71109,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71125,7 +71125,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__AVX_LD128, qmax) {
@@ -71139,7 +71139,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__AVX_LD128, strided_cm) {
@@ -71153,7 +71153,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -71169,7 +71169,7 @@
       .m(1)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__XOP_LD128, strided_cn) {
@@ -71183,7 +71183,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__XOP_LD128, k_eq_8_subtile) {
@@ -71199,7 +71199,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71216,7 +71216,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71232,7 +71232,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71247,7 +71247,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71265,7 +71265,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71282,7 +71282,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71300,7 +71300,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71317,7 +71317,7 @@
         .m(1)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71335,7 +71335,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71353,7 +71353,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71371,7 +71371,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71390,7 +71390,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71408,7 +71408,7 @@
           .m(1)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71426,7 +71426,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71445,7 +71445,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71463,7 +71463,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71482,7 +71482,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71501,7 +71501,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71519,7 +71519,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71539,7 +71539,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71558,7 +71558,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71577,7 +71577,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71593,7 +71593,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__XOP_LD128, qmax) {
@@ -71607,7 +71607,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X4C8__XOP_LD128, strided_cm) {
@@ -71621,7 +71621,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -71637,7 +71637,7 @@
       .m(2)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__XOP_LD128, strided_cn) {
@@ -71651,7 +71651,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__XOP_LD128, k_eq_8_subtile) {
@@ -71667,7 +71667,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71684,7 +71684,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71700,7 +71700,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71715,7 +71715,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71733,7 +71733,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71750,7 +71750,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71768,7 +71768,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71785,7 +71785,7 @@
         .m(2)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71803,7 +71803,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71821,7 +71821,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71839,7 +71839,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71858,7 +71858,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71876,7 +71876,7 @@
           .m(2)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71894,7 +71894,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71913,7 +71913,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71931,7 +71931,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -71950,7 +71950,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -71969,7 +71969,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -71987,7 +71987,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -72007,7 +72007,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -72026,7 +72026,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -72045,7 +72045,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -72061,7 +72061,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__XOP_LD128, qmax) {
@@ -72075,7 +72075,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X4C8__XOP_LD128, strided_cm) {
@@ -72089,7 +72089,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -72105,7 +72105,7 @@
       .m(3)
       .n(4)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__XOP_LD128, strided_cn) {
@@ -72119,7 +72119,7 @@
       .n(4)
       .k(8)
       .cn_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__XOP_LD128, k_eq_8_subtile) {
@@ -72135,7 +72135,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -72152,7 +72152,7 @@
         .n(4)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -72168,7 +72168,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -72183,7 +72183,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -72201,7 +72201,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -72218,7 +72218,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -72236,7 +72236,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -72253,7 +72253,7 @@
         .m(3)
         .n(4)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -72271,7 +72271,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -72289,7 +72289,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -72307,7 +72307,7 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -72326,7 +72326,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -72344,7 +72344,7 @@
           .m(3)
           .n(4)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -72362,7 +72362,7 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -72381,7 +72381,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -72399,7 +72399,7 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -72418,7 +72418,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -72437,7 +72437,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -72455,7 +72455,7 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -72475,7 +72475,7 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -72494,7 +72494,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -72513,7 +72513,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -72529,7 +72529,7 @@
       .n(4)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__XOP_LD128, qmax) {
@@ -72543,7 +72543,7 @@
       .n(4)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X4C8__XOP_LD128, strided_cm) {
@@ -72557,7 +72557,7 @@
       .n(4)
       .k(8)
       .cm_stride(7)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -72573,7 +72573,7 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X8C8__AVX2, strided_cn) {
@@ -72587,7 +72587,7 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X8C8__AVX2, k_eq_8_subtile) {
@@ -72603,7 +72603,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -72620,7 +72620,7 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -72636,7 +72636,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -72651,7 +72651,7 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -72669,7 +72669,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -72686,7 +72686,7 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -72704,7 +72704,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -72721,7 +72721,7 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -72739,7 +72739,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -72757,7 +72757,7 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -72775,7 +72775,7 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -72794,7 +72794,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -72812,7 +72812,7 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -72830,7 +72830,7 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -72849,7 +72849,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -72867,7 +72867,7 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -72886,7 +72886,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -72905,7 +72905,7 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -72923,7 +72923,7 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -72943,7 +72943,7 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -72962,7 +72962,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -72981,7 +72981,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -72997,7 +72997,7 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X8C8__AVX2, qmax) {
@@ -73011,7 +73011,7 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X8C8__AVX2, strided_cm) {
@@ -73025,7 +73025,7 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -73041,7 +73041,7 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X8C8__AVX2, strided_cn) {
@@ -73055,7 +73055,7 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X8C8__AVX2, k_eq_8_subtile) {
@@ -73071,7 +73071,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73088,7 +73088,7 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73104,7 +73104,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73119,7 +73119,7 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73137,7 +73137,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73154,7 +73154,7 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73172,7 +73172,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73189,7 +73189,7 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73207,7 +73207,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73225,7 +73225,7 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73243,7 +73243,7 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73262,7 +73262,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73280,7 +73280,7 @@
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73298,7 +73298,7 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73317,7 +73317,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73335,7 +73335,7 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73354,7 +73354,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73373,7 +73373,7 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73391,7 +73391,7 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73411,7 +73411,7 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73430,7 +73430,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73449,7 +73449,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73465,7 +73465,7 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X8C8__AVX2, qmax) {
@@ -73479,7 +73479,7 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X8C8__AVX2, strided_cm) {
@@ -73493,7 +73493,7 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -73509,7 +73509,7 @@
       .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X8C8__AVX2, strided_cn) {
@@ -73523,7 +73523,7 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X8C8__AVX2, k_eq_8_subtile) {
@@ -73539,7 +73539,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73556,7 +73556,7 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73572,7 +73572,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73587,7 +73587,7 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73605,7 +73605,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73622,7 +73622,7 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73640,7 +73640,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73657,7 +73657,7 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73675,7 +73675,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73693,7 +73693,7 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73711,7 +73711,7 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73730,7 +73730,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73748,7 +73748,7 @@
           .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73766,7 +73766,7 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73785,7 +73785,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73803,7 +73803,7 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73822,7 +73822,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73841,7 +73841,7 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73859,7 +73859,7 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73879,7 +73879,7 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
         }
       }
     }
@@ -73898,7 +73898,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
     }
   }
 
@@ -73917,7 +73917,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
       }
     }
   }
@@ -73933,7 +73933,7 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X8C8__AVX2, qmax) {
@@ -73947,7 +73947,7 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X8C8__AVX2, strided_cm) {
@@ -73961,7 +73961,7 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2, xnn_init_qs8_gemm_avx2_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -73977,7 +73977,7 @@
       .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X16C8__AVX512SKX, strided_cn) {
@@ -73991,7 +73991,7 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X16C8__AVX512SKX, k_eq_8_subtile) {
@@ -74007,7 +74007,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74024,7 +74024,7 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74040,7 +74040,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74055,7 +74055,7 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74073,7 +74073,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74090,7 +74090,7 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74108,7 +74108,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74125,7 +74125,7 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74143,7 +74143,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74161,7 +74161,7 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74179,7 +74179,7 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74198,7 +74198,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74216,7 +74216,7 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74234,7 +74234,7 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74253,7 +74253,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74271,7 +74271,7 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74290,7 +74290,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74309,7 +74309,7 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74327,7 +74327,7 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74347,7 +74347,7 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74366,7 +74366,7 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74385,7 +74385,7 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74401,7 +74401,7 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X16C8__AVX512SKX, qmax) {
@@ -74415,7 +74415,7 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_1X16C8__AVX512SKX, strided_cm) {
@@ -74429,7 +74429,7 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -74445,7 +74445,7 @@
       .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X16C8__AVX512SKX, strided_cn) {
@@ -74459,7 +74459,7 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X16C8__AVX512SKX, k_eq_8_subtile) {
@@ -74475,7 +74475,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74492,7 +74492,7 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74508,7 +74508,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74523,7 +74523,7 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74541,7 +74541,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74558,7 +74558,7 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74576,7 +74576,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74593,7 +74593,7 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74611,7 +74611,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74629,7 +74629,7 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74647,7 +74647,7 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74666,7 +74666,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74684,7 +74684,7 @@
           .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74702,7 +74702,7 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74721,7 +74721,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74739,7 +74739,7 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74758,7 +74758,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74777,7 +74777,7 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74795,7 +74795,7 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74815,7 +74815,7 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -74834,7 +74834,7 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74853,7 +74853,7 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74869,7 +74869,7 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X16C8__AVX512SKX, qmax) {
@@ -74883,7 +74883,7 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_2X16C8__AVX512SKX, strided_cm) {
@@ -74897,7 +74897,7 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -74913,7 +74913,7 @@
       .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X16C8__AVX512SKX, strided_cn) {
@@ -74927,7 +74927,7 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X16C8__AVX512SKX, k_eq_8_subtile) {
@@ -74943,7 +74943,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -74960,7 +74960,7 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74976,7 +74976,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -74991,7 +74991,7 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75009,7 +75009,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75026,7 +75026,7 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75044,7 +75044,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75061,7 +75061,7 @@
         .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75079,7 +75079,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75097,7 +75097,7 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75115,7 +75115,7 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75134,7 +75134,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75152,7 +75152,7 @@
           .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75170,7 +75170,7 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75189,7 +75189,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75207,7 +75207,7 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75226,7 +75226,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75245,7 +75245,7 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75263,7 +75263,7 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75283,7 +75283,7 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75302,7 +75302,7 @@
         .k(k)
         .ks(3)
         .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75321,7 +75321,7 @@
           .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75337,7 +75337,7 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X16C8__AVX512SKX, qmax) {
@@ -75351,7 +75351,7 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_3X16C8__AVX512SKX, strided_cm) {
@@ -75365,7 +75365,7 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -75381,7 +75381,7 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X16C8__AVX512SKX, strided_cn) {
@@ -75395,7 +75395,7 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X16C8__AVX512SKX, k_eq_8_subtile) {
@@ -75411,7 +75411,7 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75428,7 +75428,7 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75444,7 +75444,7 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75459,7 +75459,7 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75477,7 +75477,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75494,7 +75494,7 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75512,7 +75512,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75529,7 +75529,7 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75547,7 +75547,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75565,7 +75565,7 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75583,7 +75583,7 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75602,7 +75602,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75620,7 +75620,7 @@
           .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75638,7 +75638,7 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75657,7 +75657,7 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75675,7 +75675,7 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75694,7 +75694,7 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75713,7 +75713,7 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75731,7 +75731,7 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75751,7 +75751,7 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+            .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
         }
       }
     }
@@ -75770,7 +75770,7 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+        .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
     }
   }
 
@@ -75789,7 +75789,7 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+          .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
       }
     }
   }
@@ -75805,7 +75805,7 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X16C8__AVX512SKX, qmax) {
@@ -75819,7 +75819,7 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 
   TEST(QS8_IGEMM_MINMAX_4X16C8__AVX512SKX, strided_cm) {
@@ -75833,7 +75833,7 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse2_params);
+      .Test(xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx, xnn_init_qs8_gemm_sse4_params);
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
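Note on the substitution pattern above: each test binds a microkernel to the parameter-initialization function whose output layout that kernel actually reads, so the SSE4.1/AVX/XOP/AVX512SKX kernels now take xnn_init_qs8_gemm_sse4_params and the AVX2 kernels take xnn_init_qs8_gemm_avx2_params. The sketch below is only a hypothetical illustration of why per-ISA parameter layouts exist and why the init function must match the kernel; the struct names, fields, and init signature are assumptions for illustration, not XNNPACK's actual definitions.

/* Hypothetical sketch of per-ISA quantization-parameter layouts; every name
 * and field here is an illustrative assumption, not a real XNNPACK type. */
#include <stdint.h>
#include <string.h>

/* An SSE2-oriented layout might pre-broadcast every scalar across a full
 * 128-bit lane so the kernel can use a plain aligned vector load. */
struct example_qs8_params_sse2 {
  int32_t multiplier[4];          /* same value in all four lanes */
  int16_t output_zero_point[8];
  int16_t output_min[8];
  int16_t output_max[8];
};

/* An SSE4.1/AVX2-oriented layout can keep the clamping bounds as bytes,
 * since those ISAs can widen or broadcast them cheaply at load time. */
struct example_qs8_params_sse4 {
  int32_t multiplier[4];
  int16_t output_zero_point[8];
  int8_t  output_min[16];
  int8_t  output_max[16];
};

/* Each layout gets its own init function; a kernel must be paired with the
 * init function that fills the layout it expects, which is what the
 * sse2 -> sse4/avx2 substitutions in the tests and yaml above enforce. */
static void example_init_qs8_params_sse4(struct example_qs8_params_sse4* p,
                                         int32_t multiplier,
                                         int8_t output_zero_point,
                                         int8_t output_min,
                                         int8_t output_max) {
  for (int i = 0; i < 4; i++) p->multiplier[i] = multiplier;
  for (int i = 0; i < 8; i++) p->output_zero_point[i] = (int16_t) output_zero_point;
  memset(p->output_min, (unsigned char) output_min, sizeof(p->output_min));
  memset(p->output_max, (unsigned char) output_max, sizeof(p->output_max));
}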
 
diff --git a/test/qs8-igemm-minmax.yaml b/test/qs8-igemm-minmax.yaml
index 25d87b7..eddebf6 100644
--- a/test/qs8-igemm-minmax.yaml
+++ b/test/qs8-igemm-minmax.yaml
@@ -295,28 +295,28 @@
   init: xnn_init_qs8_gemm_sse2_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld128
   init: xnn_init_qs8_gemm_sse2_params
@@ -343,40 +343,40 @@
   init: xnn_init_qs8_gemm_sse2_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c2__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c2__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c2__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c2__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c2__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_4x4c2__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c2__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c2__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64
   init: xnn_init_qs8_gemm_sse2_params
@@ -397,31 +397,31 @@
   init: xnn_init_qs8_gemm_sse2_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld128
   init: xnn_init_qs8_gemm_sse2_params
@@ -442,52 +442,52 @@
   init: xnn_init_qs8_gemm_sse2_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c8__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c8__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c8__avx_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_avx2_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx
-  init: xnn_init_qs8_gemm_sse2_params
+  init: xnn_init_qs8_gemm_sse4_params
   k-block: 8
 - name: xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64
   init: xnn_init_qs8_gemm_wasmsimd_params