QC8 GEMM/IGEMM microkernels for SSE/AVX/XOP
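
These QC8 kernels are generated from the existing QS8 MRx4c2/MRx4c8 SSE templates via a new
CHANNELWISE flag passed to xngen, use FP32 requantization, and are emitted under
src/qc8-gemm/gen and src/qc8-igemm/gen in LD64/LD128 load variants for SSE2, SSE4.1, AVX,
and XOP. The sketch below is only an illustration (not the generated kernel code, and the
function name and parameters are hypothetical) of the idea that distinguishes QC8 from the
per-tensor QS8 FP32 path: each int32 accumulator column is scaled by its own per-output-channel
fp32 scale before being rounded, offset by the output zero point, clamped, and narrowed to int8.

    #include <emmintrin.h>  // SSE2 intrinsics
    #include <stdint.h>

    // Illustrative sketch only (assumed names/signature, not XNNPACK's generated code):
    // requantize four int32 accumulators with per-output-channel fp32 scales.
    static inline __m128i qc8_requantize_sketch(
        __m128i vacc,               // 4 int32 accumulators, one per output channel
        const float* channel_scale, // 4 per-channel scales (assumed packed with the weights)
        int16_t output_zero_point,
        int16_t output_min,
        int16_t output_max)
    {
      // Scale each accumulator by its own channel scale in fp32.
      const __m128 vscaled =
          _mm_mul_ps(_mm_cvtepi32_ps(vacc), _mm_loadu_ps(channel_scale));
      // Round to nearest int32, add the output zero point in int16, clamp, narrow to int8.
      const __m128i vrounded = _mm_cvtps_epi32(vscaled);
      __m128i vout16 = _mm_adds_epi16(_mm_packs_epi32(vrounded, vrounded),
                                      _mm_set1_epi16(output_zero_point));
      vout16 = _mm_min_epi16(_mm_max_epi16(vout16, _mm_set1_epi16(output_min)),
                             _mm_set1_epi16(output_max));
      return _mm_packs_epi16(vout16, vout16);
    }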

PiperOrigin-RevId: 377345679
diff --git a/BUILD.bazel b/BUILD.bazel
index c31bf48..54fd96e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -2624,6 +2624,34 @@
     "src/math/sigmoid-sse2-rr2-p5-div.c",
     "src/math/sigmoid-sse2-rr2-p5-nr1.c",
     "src/math/sigmoid-sse2-rr2-p5-nr2.c",
+    "src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c",
+    "src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
+    "src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c",
+    "src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c",
+    "src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c",
+    "src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
+    "src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c",
+    "src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c",
+    "src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c",
+    "src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
+    "src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c",
+    "src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c",
+    "src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c",
+    "src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
+    "src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c",
+    "src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-sse2-mul16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse2-mul16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-sse2-mul16.c",
@@ -2859,6 +2887,34 @@
     "src/math/roundne-sse41.c",
     "src/math/roundu-sse41.c",
     "src/math/roundz-sse41.c",
+    "src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c",
+    "src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
+    "src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c",
+    "src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c",
+    "src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c",
+    "src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
+    "src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c",
+    "src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c",
+    "src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c",
+    "src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
+    "src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c",
+    "src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c",
+    "src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c",
+    "src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
+    "src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c",
+    "src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul16.c",
@@ -3122,6 +3178,34 @@
     "src/math/sigmoid-avx-rr2-p5-div.c",
     "src/math/sigmoid-avx-rr2-p5-nr1.c",
     "src/math/sigmoid-avx-rr2-p5-nr2.c",
+    "src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c",
+    "src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c",
+    "src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c",
+    "src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c",
+    "src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c",
+    "src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c",
+    "src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c",
+    "src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c",
+    "src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c",
+    "src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c",
+    "src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c",
+    "src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c",
+    "src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c",
+    "src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c",
+    "src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c",
+    "src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul16.c",
@@ -3228,6 +3312,34 @@
 ]
 
 XOP_UKERNELS = [
+    "src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c",
+    "src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
+    "src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c",
+    "src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
+    "src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c",
+    "src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c",
+    "src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c",
+    "src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c",
+    "src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c",
+    "src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
+    "src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c",
+    "src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
+    "src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c",
+    "src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c",
+    "src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c",
+    "src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c84da62..9ce36a3 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1864,6 +1864,34 @@
   src/math/sigmoid-sse2-rr2-p5-div.c
   src/math/sigmoid-sse2-rr2-p5-nr1.c
   src/math/sigmoid-sse2-rr2-p5-nr2.c
+  src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+  src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+  src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+  src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+  src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+  src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+  src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+  src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+  src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+  src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+  src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
+  src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
+  src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+  src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
+  src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+  src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+  src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+  src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+  src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+  src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+  src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+  src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+  src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+  src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+  src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
+  src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
+  src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+  src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
   src/qs8-dwconv/gen/up16x25-minmax-fp32-sse2-mul16.c
   src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse2-mul16.c
   src/qs8-dwconv/gen/up16x9-minmax-fp32-sse2-mul16.c
@@ -2097,6 +2125,34 @@
   src/math/roundne-sse41.c
   src/math/roundu-sse41.c
   src/math/roundz-sse41.c
+  src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+  src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+  src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+  src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+  src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+  src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+  src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+  src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+  src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+  src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+  src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+  src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
+  src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+  src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+  src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+  src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+  src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+  src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+  src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+  src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+  src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+  src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+  src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+  src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+  src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+  src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
+  src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+  src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
   src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul16.c
   src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c
   src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul16.c
@@ -2357,6 +2413,34 @@
   src/math/sigmoid-avx-rr2-p5-div.c
   src/math/sigmoid-avx-rr2-p5-nr1.c
   src/math/sigmoid-avx-rr2-p5-nr2.c
+  src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+  src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+  src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+  src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+  src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+  src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+  src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+  src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+  src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+  src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+  src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+  src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
+  src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+  src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
+  src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+  src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+  src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+  src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+  src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+  src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+  src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+  src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+  src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+  src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+  src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+  src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
+  src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+  src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
   src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul16.c
   src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c
   src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul16.c
@@ -2462,6 +2546,34 @@
   src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x32.c)
 
 SET(XNNPACK_XOP_MICROKERNEL_SRCS
+  src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+  src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+  src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+  src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+  src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+  src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+  src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+  src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+  src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+  src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+  src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+  src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
+  src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+  src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
+  src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+  src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+  src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+  src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+  src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+  src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+  src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+  src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+  src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+  src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+  src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+  src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
+  src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+  src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
   src/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c
   src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c
   src/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index de33876..3680392 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -154,221 +154,293 @@
 
 ################################### x86 SSE ###################################
 ### C2 micro-kernels
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
+
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
 
 ### C8 micro-kernels
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64     -o src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
+
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
+
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
+
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
+
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
 
 ################################### x86 AVX2 ##################################
 ### C8 micro-kernels
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index d3b6584..34ef331 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -147,176 +147,248 @@
 
 ################################### x86 SSE ###################################
 ### C2 micro-kernels
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
 
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
 
 ### C8 micro-kernels
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD64  -o src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
 
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=0 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+tools/xngen src/qs8-igemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D CHANNELWISE=1 -D REQUANTIZATION=FP32     -D VARIANT=LD128 -o src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
 
 ################################### x86 AVX2 ##################################
 ### C8 micro-kernels
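The QC8 kernels generated below (CHANNELWISE=1) use FP32 requantization with a per-channel scale: after the accumulation loop, each group of four output channels reads its own float scale from the packed-weights stream (the four floats that follow the int8 weights), scales the int32 accumulators in floating point, converts back, adds the output zero point, and clamps. The standalone sketch below mirrors that epilogue for one row of four accumulators; it is illustrative only, and the helper name and the `const void** w` signature are not part of this diff.

#include <smmintrin.h>  // SSE4.1 intrinsics, matching the sse41/avx/xop variants below
#include <stdint.h>

// Sketch only: per-channel FP32 requantization of four int32 accumulators, as in the
// qc8 epilogues. The four float scales are assumed to be packed immediately after the
// int8 weights, so *w points at them on entry (as in the generated kernels).
static inline __m128i qc8_requantize_4x(__m128i vacc, const void** w,
                                        __m128i voutput_zero_point,
                                        __m128i voutput_min, __m128i voutput_max)
{
  // Scale in floating point using the per-channel scales from the weight stream.
  __m128 vscaled = _mm_cvtepi32_ps(vacc);
  const __m128 vscale = _mm_loadu_ps((const float*) *w);
  *w = (const void*) ((uintptr_t) *w + 4 * sizeof(float));
  vscaled = _mm_mul_ps(vscaled, vscale);

  // Round to int32, add the output zero point with saturation, pack to int8, clamp.
  vacc = _mm_cvtps_epi32(vscaled);
  const __m128i vacc16 = _mm_adds_epi16(_mm_packs_epi32(vacc, vacc), voutput_zero_point);
  __m128i vout = _mm_packs_epi16(vacc16, vacc16);
  vout = _mm_max_epi8(vout, voutput_min);
  vout = _mm_min_epi8(vout, voutput_max);
  return vout;  // the low 4 bytes hold the quantized outputs for these 4 channels
}

The SSE2 variants clamp in 16-bit before the final pack (_mm_max_epi16/_mm_min_epi16) instead, since _mm_max_epi8/_mm_min_epi8 require SSE4.1.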
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..f2e24a7
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
@@ -0,0 +1,147 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
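In the c2 kernels, kc is first rounded up to a multiple of 2 (round_up_po2(kc, 2)), so the remainder after the 8-byte main loop can only be 2, 4, or 6 bytes; the nested k > 2 / k > 4 checks then issue one, two, or three extra _mm_madd_epi16 steps. A small sketch of that bookkeeping, in plain C without intrinsics; round_up_po2 here only mirrors the XNNPACK helper of the same name:

#include <stddef.h>

// Sketch only: how the c2 remainder ladder covers kc values that are not multiples of 8.
static inline size_t round_up_po2(size_t n, size_t po2) {
  return (n + (po2 - 1)) & ~(po2 - 1);
}

static size_t count_madd_steps(size_t kc) {
  kc = round_up_po2(kc, 2);               // e.g. kc = 9 -> 10
  size_t steps = 0;
  size_t k = kc;
  while (k >= 8) { steps += 4; k -= 8; }  // main loop: 4 madd steps per 8 bytes of A
  if (k != 0) {
    steps += 1;                           // k == 2, 4, or 6: always at least one step
    if (k > 2) {
      steps += 1;                         // k == 4 or 6
      if (k > 4) {
        steps += 1;                       // k == 6
      }
    }
  }
  return steps;                           // kc = 10 -> 4 (main) + 1 (remainder) = 5
}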
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..2a04fc8
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
@@ -0,0 +1,147 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..617fc79
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,148 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..ddc1095
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,148 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..e213352
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,147 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
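
The epilogue above is the QC8 "fp32" requantization shared by all of these kernels: the int32 accumulator is converted to float, multiplied by four per-output-channel scales stored in the packed weight buffer right after the weights, rounded back to int32, packed to int16 with saturation, offset by the output zero point, and finally clamped and narrowed to int8. A scalar sketch of the same math for a single lane, assuming the default round-to-nearest-even mode used by _mm_cvtps_epi32 (requantize_fp32 is an illustrative helper, not an XNNPACK function):

#include <math.h>
#include <stdint.h>

static inline int8_t requantize_fp32(int32_t acc, float scale,
                                     int16_t output_zero_point,
                                     int8_t output_min, int8_t output_max)
{
  // Per-channel scaling of the raw accumulator.
  const float scaled = (float) acc * scale;
  // Round to nearest, ties to even, like _mm_cvtps_epi32 under the default MXCSR mode.
  long v = lrintf(scaled);
  // Saturate to int16 as _mm_packs_epi32 does, then add the zero point with
  // int16 saturation as _mm_adds_epi16 does.
  if (v > INT16_MAX) v = INT16_MAX;
  if (v < INT16_MIN) v = INT16_MIN;
  v += output_zero_point;
  if (v > INT16_MAX) v = INT16_MAX;
  if (v < INT16_MIN) v = INT16_MIN;
  // Clamp to the requested output range (the SSE4.1 kernels clamp after the int8
  // pack, the SSE2 kernels clamp in int16 before it; the result is the same).
  if (v > output_max) v = output_max;
  if (v < output_min) v = output_min;
  return (int8_t) v;
}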
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..048febc
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,147 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
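
In these 4c2 kernels the weights are packed two K elements at a time for each of the four output channels, so a single pmaddwd computes a length-2 dot product per channel: the activation pair (a[k], a[k+1]) is broadcast to every 32-bit lane with _mm_shuffle_epi32, and the packed weight vector supplies the matching (w[n][k], w[n][k+1]) pair in lane n. A minimal sketch of one such step, assuming that packing order (qc8_c2_step0 is an illustrative helper, not part of XNNPACK):

#include <emmintrin.h>

// vxa:  8 sign-extended int16 activations a[0..7].
// vxb0: int16 weights packed as { w0[k], w0[k+1], w1[k], w1[k+1],
//                                 w2[k], w2[k+1], w3[k], w3[k+1] }.
static inline __m128i qc8_c2_step0(__m128i vacc, __m128i vxa, __m128i vxb0)
{
  // Broadcast the first activation pair (a[0], a[1]) into every 32-bit lane;
  // the kernels use _MM_SHUFFLE(1,1,1,1) .. (3,3,3,3) for the remaining pairs.
  const __m128i va_pair = _mm_shuffle_epi32(vxa, _MM_SHUFFLE(0, 0, 0, 0));
  // pmaddwd: lane n becomes a[0]*wn[k] + a[1]*wn[k+1], added into the accumulator.
  return _mm_add_epi32(vacc, _mm_madd_epi16(va_pair, vxb0));
}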
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..6b65214
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
@@ -0,0 +1,152 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..a182e89
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
@@ -0,0 +1,152 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
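
The XOP variants above differ from the SSE4.1 ones only in the accumulator update: the two-instruction pmaddwd-plus-paddd sequence becomes a single fused multiply-add-accumulate. A small equivalence sketch, assuming a compiler targeting XOP and illustrative helper names:

#if defined(__GNUC__) || defined(__clang__)
  #include <x86intrin.h>
#else
  #include <immintrin.h>
#endif

// Two ways to express the same int16 -> int32 dot-product accumulation.
static inline __m128i acc_update_sse(__m128i vacc, __m128i vxa, __m128i vxb)
{
  return _mm_add_epi32(vacc, _mm_madd_epi16(vxa, vxb));  // pmaddwd + paddd
}

#ifdef __XOP__
static inline __m128i acc_update_xop(__m128i vacc, __m128i vxa, __m128i vxb)
{
  return _mm_maddd_epi16(vxa, vxb, vacc);  // fused form of the same update
}
#endif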
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..82f7100
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
@@ -0,0 +1,118 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
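
The c8 kernels keep one accumulator register per output channel, so after the K loop each register holds four int32 partial sums that still need a horizontal reduction into a single vector of channel totals. The SSE4.1/AVX/XOP files do this with two rounds of _mm_hadd_epi32, as above; a sketch of that reduction (reduce4_hadd is an illustrative name):

#include <tmmintrin.h>  // _mm_hadd_epi32 (SSSE3)

// Given per-channel partial-sum vectors a, b, c, d, return { sum(a), sum(b), sum(c), sum(d) }.
static inline __m128i reduce4_hadd(__m128i a, __m128i b, __m128i c, __m128i d)
{
  const __m128i ab = _mm_hadd_epi32(a, b);  // { a0+a1, a2+a3, b0+b1, b2+b3 }
  const __m128i cd = _mm_hadd_epi32(c, d);  // { c0+c1, c2+c3, d0+d1, d2+d3 }
  return _mm_hadd_epi32(ab, cd);            // { sum(a), sum(b), sum(c), sum(d) }
}

The SSE2 files further down perform the same reduction with unpack/add pairs, since horizontal adds require SSSE3.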
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..709321b
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
@@ -0,0 +1,120 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..f82da93
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,119 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
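
SSE2 lacks pmovsxbw, so the SSE2 variants widen int8 lanes to int16 with the compare-and-unpack idiom seen above, applied to both activations and weights. A minimal sketch of the equivalence (the helper name is illustrative):

#include <emmintrin.h>

// Sign-extend the low 8 int8 lanes of v to 8 int16 lanes without SSE4.1.
static inline __m128i sign_extend_lo_epi8_sse2(__m128i v)
{
  // 0xFF in every byte where v is negative, 0x00 elsewhere.
  const __m128i vsign = _mm_cmpgt_epi8(_mm_setzero_si128(), v);
  // Interleaving each byte with its sign mask produces the sign-extended int16 value.
  return _mm_unpacklo_epi8(v, vsign);
}

On SSE4.1, AVX, and XOP the same widening is the single _mm_cvtepi8_epi16(v). The SSE2 output path likewise clamps in int16 with _mm_max_epi16/_mm_min_epi16 before the final pack and stores the last byte via _mm_cvtsi128_si32, because _mm_max_epi8/_mm_min_epi8 and _mm_extract_epi8 also require SSE4.1.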
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..dceacec
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,121 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..4b0bffd
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,118 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..0c1870e
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,120 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..239f15d
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
@@ -0,0 +1,123 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+      vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+      vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..9acbca1
--- /dev/null
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
@@ -0,0 +1,125 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..b208c3b
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
@@ -0,0 +1,183 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..701c66d
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
@@ -0,0 +1,183 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
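The ld64/ld128 suffixes differ only in how the packed weights are fetched: ld64 loads 8 weight bytes per 2-element K group and widens each load with _mm_cvtepi8_epi16, while ld128 loads 16 bytes at once and widens both halves by interleaving with a computed sign mask. The "avx" kernels use the same SSE4.1 intrinsics (<smmintrin.h>), presumably so that building with AVX enabled yields VEX-encoded three-operand instructions rather than any new operations. A minimal sketch of the two widening idioms (names are illustrative):

#include <smmintrin.h>

// SSE4.1 PMOVSXBW: sign-extend the low 8 bytes of v8 to 8 int16 lanes.
static inline __m128i widen_lo_sse41(__m128i v8) {
  return _mm_cvtepi8_epi16(v8);
}

// Idiom used after a 16-byte (ld128) weight load: comparing against zero
// yields 0xFF for every negative byte, and interleaving that mask above
// each byte sign-extends both halves to int16 lanes.
static inline void widen_both_halves(__m128i v8, __m128i* lo, __m128i* hi) {
  const __m128i vsign = _mm_cmpgt_epi8(_mm_setzero_si128(), v8);
  *lo = _mm_unpacklo_epi8(v8, vsign);
  *hi = _mm_unpackhi_epi8(v8, vsign);
}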
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..f232688
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,185 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..cbfea83
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,185 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
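Relative to the SSE4.1/AVX variants above, the SSE2 kernels clamp while the result is still in int16 lanes (_mm_max_epi16/_mm_min_epi16 before the final pack), since _mm_max_epi8, _mm_min_epi8, _mm_extract_epi8 and _mm_extract_epi32 all require SSE4.1; row 1 is likewise stored through _mm_srli_si128 plus _mm_cvtsi128_si32. A sketch of the two clamp orders, assuming the min/max parameters are pre-broadcast in the matching lane width (as the kernels' params structs provide):

#include <smmintrin.h>  // also pulls in the SSE2 intrinsics

// SSE2 order: clamp int16 lanes, then saturating-pack to int8.
static inline __m128i clamp_then_pack_sse2(__m128i acc16, __m128i vmin16, __m128i vmax16) {
  acc16 = _mm_min_epi16(_mm_max_epi16(acc16, vmin16), vmax16);
  return _mm_packs_epi16(acc16, acc16);
}

// SSE4.1 order: saturating-pack to int8, then clamp the int8 lanes directly.
static inline __m128i pack_then_clamp_sse41(__m128i acc16, __m128i vmin8, __m128i vmax8) {
  const __m128i vout = _mm_packs_epi16(acc16, acc16);
  return _mm_min_epi8(_mm_max_epi8(vout, vmin8), vmax8);
}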
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..8ace15a
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,183 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..f1c07c1
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,183 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
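Since kc is rounded up to a multiple of 2, the remainder block after the main 8-byte K loop only ever sees 2, 4, or 6 leftover bytes, which the nested if (k > 2 ...) / if (k > 4 ...) branches above consume one 2-byte group at a time. A hypothetical scalar equivalent of that tail, assuming w points at the remaining packed weights in the 4c2 layout implied by the code (4 channels x 2 K values per 8-byte group):

#include <stddef.h>
#include <stdint.h>

static void qc8_gemm_c2_tail_ref(
  size_t k_remainder,  // 2, 4, or 6 after round_up_po2(kc, 2)
  const int8_t* a, const int8_t* w, int32_t acc[4])
{
  for (size_t k = 0; k < k_remainder; k += 2) {
    for (size_t n = 0; n < 4; n++) {
      // Same pairing as PMADDWD on the widened lanes: two products per
      // accumulator lane, one per K value in the group.
      acc[n] += (int32_t) a[k + 0] * (int32_t) w[(k / 2) * 8 + 2 * n + 0]
              + (int32_t) a[k + 1] * (int32_t) w[(k / 2) * 8 + 2 * n + 1];
    }
  }
}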
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..7dcb9d9
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..ea2a328
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
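The XOP variants differ from the SSE4.1/AVX ones only in the inner-product step: the two-instruction _mm_add_epi32(vacc, _mm_madd_epi16(va, vb)) sequence is fused into a single multiply-add-accumulate. A minimal sketch of the equivalence (both forms compute acc[i] += a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1] for each int32 lane i):

#if defined(__GNUC__) || defined(__clang__)
  #include <x86intrin.h>
#else
  #include <immintrin.h>
  #include <ammintrin.h>
#endif

static inline __m128i madd_accumulate_sse(__m128i acc, __m128i a, __m128i b) {
  return _mm_add_epi32(acc, _mm_madd_epi16(a, b));
}

static inline __m128i madd_accumulate_xop(__m128i acc, __m128i a, __m128i b) {
  return _mm_maddd_epi16(a, b, acc);  // XOP vpmadcswd
}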
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..3595651
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
@@ -0,0 +1,147 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..d9b0072
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
@@ -0,0 +1,149 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..7ab73a6
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,149 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..e205ebe
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,151 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..7fa45b8
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,147 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..d758e6c
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,149 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..33313e4
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
@@ -0,0 +1,152 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+      vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+      vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
+      vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+      vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+      vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
+      vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..7d2cffe
--- /dev/null
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
@@ -0,0 +1,154 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+      vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+      vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+      vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+      vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..1550ecc
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
@@ -0,0 +1,220 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..bf95edb
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
@@ -0,0 +1,220 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
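
These QC8 kernels requantize in fp32 with a per-output-channel scale stored inline after the packed weights (the _mm_loadu_ps((const float*) w) load above): the int32 accumulators are converted to float, scaled, rounded back to int32, packed to int16 with the output zero point, packed to int8, and clamped. A minimal scalar sketch of the per-element sequence (the helper name is illustrative, not part of the patch):

#include <math.h>
#include <stdint.h>

/* Illustrative scalar equivalent of the fp32 requantization above. */
static inline int8_t requantize_fp32(int32_t acc, float scale, int16_t zero_point,
                                     int8_t out_min, int8_t out_max) {
  /* _mm_cvtepi32_ps / _mm_mul_ps / _mm_cvtps_epi32: scale, round to nearest even. */
  long v = lrintf((float) acc * scale);
  /* _mm_packs_epi32: saturate to int16. */
  if (v > INT16_MAX) v = INT16_MAX;
  if (v < INT16_MIN) v = INT16_MIN;
  /* _mm_adds_epi16: add the output zero point with saturation. */
  v += zero_point;
  if (v > INT16_MAX) v = INT16_MAX;
  if (v < INT16_MIN) v = INT16_MIN;
  /* _mm_packs_epi16 + _mm_max_epi8/_mm_min_epi8: saturate to int8 and clamp. */
  if (v > out_max) v = out_max;
  if (v < out_min) v = out_min;
  return (int8_t) v;
}

The SSE2 variants below clamp on the int16 values before the final pack instead of on the int8 result, which gives the same output since output_min/output_max fit in int8.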
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..e337bbc
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,224 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+      a2 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
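
SSE2 has no pmovsxbw, so this variant widens signed bytes to 16 bits by interleaving each vector with its own sign mask; the SSE4.1/AVX/XOP variants use _mm_cvtepi8_epi16 for the activations instead. A minimal sketch of the two equivalent widenings (helper names are illustrative; the second requires SSE4.1):

#include <emmintrin.h>
#include <smmintrin.h>

/* Widen the low 8 signed bytes of v to 8 signed 16-bit lanes, two ways. */
static inline __m128i widen_lo_sse2(__m128i v) {
  const __m128i vsign = _mm_cmpgt_epi8(_mm_setzero_si128(), v);  /* 0xFF where v is negative */
  return _mm_unpacklo_epi8(v, vsign);
}

static inline __m128i widen_lo_sse41(__m128i v) {
  return _mm_cvtepi8_epi16(v);
}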
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..bb1730c
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,224 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+      a2 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..f5f3f1c
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,220 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..042d7b5
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,220 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
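
The ld64/ld128 suffixes differ only in how the packed int8 weights are fetched: ld64 issues four 8-byte loads and widens each group separately, while ld128 loads 16 bytes at once and splits them into two sign-extended int16 vectors with unpacklo/unpackhi. A minimal sketch of the ld128-style split (the helper name is illustrative, not part of the patch):

#include <emmintrin.h>

/* Split one 16-byte weight load into two sign-extended int16 vectors,
   as the *-ld128 kernels do for vxb0/vxb1 (and again for vxb2/vxb3). */
static inline void load_weights_ld128(const void* w, __m128i* vxb0, __m128i* vxb1) {
  const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
  const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
  *vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
  *vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
}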
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..def9e5e
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
@@ -0,0 +1,225 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+          vacc2x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
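
The XOP variants replace the _mm_add_epi32(_mm_madd_epi16(...)) pair with _mm_maddd_epi16, which multiplies int16 pairs, sums adjacent products, and accumulates into the int32 destination in one instruction. A minimal sketch of the two equivalent accumulator updates (helper names are illustrative; the XOP form needs -mxop or the MSVC equivalent):

#if defined(__GNUC__) || defined(__clang__)
  #include <x86intrin.h>
#else
  #include <immintrin.h>
  #include <ammintrin.h>
#endif

/* One inner-loop accumulator update: plain SSE form vs. the fused XOP form. */
static inline __m128i acc_update_sse(__m128i vacc, __m128i vxa, __m128i vxb) {
  return _mm_add_epi32(vacc, _mm_madd_epi16(vxa, vxb));
}

static inline __m128i acc_update_xop(__m128i vacc, __m128i vxa, __m128i vxb) {
  return _mm_maddd_epi16(vxa, vxb, vacc);
}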
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..184754d
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
@@ -0,0 +1,225 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+          vacc2x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
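
Note on the XOP variants above (an illustrative aside, not part of the generated sources): where the SSE2/SSE4.1 kernels accumulate with _mm_add_epi32(vacc, _mm_madd_epi16(vxa, vxb)), the XOP kernels fuse the same multiply-add-accumulate into a single _mm_maddd_epi16(vxa, vxb, vacc). A minimal sketch of the two equivalent forms, assuming an x86 compiler with XOP support only when __XOP__ is defined:

#include <stdint.h>
#if defined(__XOP__)
  #include <x86intrin.h>   // XOP intrinsics
#else
  #include <emmintrin.h>   // SSE2
#endif

// Multiply eight signed 16-bit lanes pairwise, sum adjacent products into
// four 32-bit lanes, and add the result to the running accumulator -- the
// inner step of the dot-product loops above.
static inline __m128i madd_accumulate(__m128i vxa, __m128i vxb, __m128i vacc) {
#if defined(__XOP__)
  return _mm_maddd_epi16(vxa, vxb, vacc);                 // fused on XOP
#else
  return _mm_add_epi32(vacc, _mm_madd_epi16(vxa, vxb));   // two-instruction SSE form
#endif
}

The fused form saves the separate vector add in the inner loop; otherwise the XOP and SSE4.1 kernels are generated from the same template.
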
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..5744d2a
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
@@ -0,0 +1,177 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
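
The -ld128 suffix above refers to how the packed int8 weights are loaded: one aligned 16-byte load brings in two 8-element weight columns at once, which are then widened to int16 by interleaving with a sign mask, whereas the -ld64 variants issue separate 8-byte loads and widen each with _mm_cvtepi8_epi16. A small sketch of the sign-mask widening used by the ld128 path (illustrative only, SSE2-level intrinsics):

#include <emmintrin.h>  // SSE2

// Widen 16 signed bytes to two vectors of eight signed 16-bit values.
// Interleaving each byte with its sign mask (0xFF for negative bytes,
// 0x00 otherwise) sign-extends both halves of the 16-byte load.
static inline void widen_s8x16(const __m128i vb01, __m128i* vxb0, __m128i* vxb1) {
  const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
  *vxb0 = _mm_unpacklo_epi8(vb01, vsb01);  // bytes 0..7  -> int16
  *vxb1 = _mm_unpackhi_epi8(vb01, vsb01);  // bytes 8..15 -> int16
}
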
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..a6246de
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
@@ -0,0 +1,179 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
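
In the c8 kernels each output column gets its own accumulator (vacc0x0 ... vacc0x3 per row), so after the k loop the four partial-sum vectors have to be reduced to a single vector holding one 32-bit sum per column. The AVX/SSE4.1 variants do this with two rounds of _mm_hadd_epi32, as sketched below (illustrative only):

#include <tmmintrin.h>  // SSSE3: _mm_hadd_epi32

// Reduce four accumulators, each holding four partial sums for one output
// column, into a single vector {sumA, sumB, sumC, sumD}.
static inline __m128i reduce_columns(__m128i vacc0, __m128i vacc1,
                                     __m128i vacc2, __m128i vacc3) {
  const __m128i vacc01 = _mm_hadd_epi32(vacc0, vacc1);  // {a0+a1, a2+a3, b0+b1, b2+b3}
  const __m128i vacc23 = _mm_hadd_epi32(vacc2, vacc3);  // {c0+c1, c2+c3, d0+d1, d2+d3}
  return _mm_hadd_epi32(vacc01, vacc23);                // {sumA, sumB, sumC, sumD}
}
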
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..8275917
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,181 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+      a2 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+    const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
+    const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+    __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..80d112b
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,183 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+      a2 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+    const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
+    const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+    __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
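
The SSE2 variants of the same template avoid every post-SSE2 instruction: int8 inputs are sign-extended with the cmpgt/unpacklo trick instead of _mm_cvtepi8_epi16, the per-column accumulators are reduced with an unpack/add tree instead of _mm_hadd_epi32, and the result is clamped in the 16-bit domain (_mm_max_epi16/_mm_min_epi16 before the final pack) because SSE2 has no signed 8-bit min/max. A sketch of the SSE2 reduction, equivalent to the hadd-based one in the SSE4.1/AVX files (illustrative only):

#include <emmintrin.h>  // SSE2 only

// SSE2 replacement for two rounds of _mm_hadd_epi32: reduce four per-column
// accumulators to one vector {sum0, sum1, sum2, sum3} using unpack + add.
static inline __m128i reduce_columns_sse2(__m128i vacc0, __m128i vacc1,
                                          __m128i vacc2, __m128i vacc3) {
  // Interleave columns 0/2 and 1/3, then add the lanes that belong to the same column.
  const __m128i vacc02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0, vacc2),
                                       _mm_unpackhi_epi32(vacc0, vacc2));
  const __m128i vacc13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1, vacc3),
                                       _mm_unpackhi_epi32(vacc1, vacc3));
  // A second interleave/add round puts the totals back into column order.
  return _mm_add_epi32(_mm_unpacklo_epi32(vacc02, vacc13),
                       _mm_unpackhi_epi32(vacc02, vacc13));
}
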
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..b1209ec
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,177 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..6d9a6bd
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,179 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
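
All of the -fp32- kernels in this change requantize the same way: the int32 accumulator is converted to float, multiplied by a per-channel scale read from the packed weights, converted back with _mm_cvtps_epi32 (round to nearest, ties to even under the default MXCSR rounding mode), then the output zero point is added and the value is clamped and packed down to int8. A scalar reference of that path, useful for checking a single output value (an illustrative sketch that ignores the saturation performed by the packing instructions on extreme accumulators):

#include <math.h>
#include <stdint.h>

// Scalar model of the fp32 requantization used by the kernels above.
static inline int8_t requantize_fp32(int32_t acc, float scale,
                                     int16_t output_zero_point,
                                     int8_t output_min, int8_t output_max) {
  const float scaled = (float) acc * scale;
  long out = lrintf(scaled);             // round to nearest, ties to even
  out += (long) output_zero_point;
  if (out < (long) output_min) out = (long) output_min;
  if (out > (long) output_max) out = (long) output_max;
  return (int8_t) out;
}
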
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..904263f
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
@@ -0,0 +1,182 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+      vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+      vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
+      vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
+      vacc2x0 = _mm_maddd_epi16(vxa2, vxb0, vacc2x0);
+      vacc2x1 = _mm_maddd_epi16(vxa2, vxb1, vacc2x1);
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+      vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+      vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
+      vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
+      vacc2x2 = _mm_maddd_epi16(vxa2, vxb2, vacc2x2);
+      vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..48bec57
--- /dev/null
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
@@ -0,0 +1,184 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
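+    // LD64 variant: weights are loaded 8 bytes (one output column's worth of
+    // K values) at a time and sign-extended to int16 with _mm_cvtepi8_epi16.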
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+      vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
+      vacc2x0 = _mm_maddd_epi16(vxa2, vxb0, vacc2x0);
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+      vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
+      vacc2x1 = _mm_maddd_epi16(vxa2, vxb1, vacc2x1);
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+      vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
+      vacc2x2 = _mm_maddd_epi16(vxa2, vxb2, vacc2x2);
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+      vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
+      vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
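+    // Store the 3x4 tile; when fewer than 4 output columns remain, the
+    // nc & 2 / nc & 1 branches write the 2- and 1-byte tails per row.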
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..e18d5d7
--- /dev/null
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
@@ -0,0 +1,256 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
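+    // C2 layout: each 32-bit lane of the sign-extended activations holds two
+    // adjacent K values; _mm_shuffle_epi32 broadcasts one such pair and
+    // _mm_madd_epi16 multiplies it against the matching pair of weights in
+    // every output column.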
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
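+    // Remainder: kc was rounded up to a multiple of 2, so 2, 4, or 6 K values
+    // may be left over after the 8-wide main loop.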
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..d6f5fc2
--- /dev/null
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
@@ -0,0 +1,256 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
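+    // LD64 variant of the 4x4c2 kernel: B is fetched 8 bytes (4 columns x
+    // 2 K values) per load instead of 16 bytes as in the LD128 variant.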
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..0d9da78
--- /dev/null
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,261 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
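+    // SSE2 lacks pmovsxbw, so int8 values are sign-extended to int16 by
+    // interleaving with a sign mask computed via _mm_cmpgt_epi8.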
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+      a2 += 8;
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_unpacklo_epi8(va3, _mm_cmpgt_epi8(_mm_setzero_si128(), va3));
+      a3 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_unpacklo_epi8(va3, _mm_cmpgt_epi8(_mm_setzero_si128(), va3));
+      a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
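+    // SSE2 has no signed-byte min/max, so the output range is clamped on the
+    // int16 values before the final pack to int8.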
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..81f3f5c
--- /dev/null
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,261 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+      a2 += 8;
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_unpacklo_epi8(va3, _mm_cmpgt_epi8(_mm_setzero_si128(), va3));
+      a3 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_unpacklo_epi8(va3, _mm_cmpgt_epi8(_mm_setzero_si128(), va3));
+      a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+
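+    // SSE2 has no _mm_extract_epi32, so each row's 4 output bytes are exposed
+    // with _mm_srli_si128 and written via _mm_cvtsi128_si32.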
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..f9b111d
--- /dev/null
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,256 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
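+    // Main loop: consume K in blocks of 8 bytes (4 pairs); activations and weights are sign-extended to 16 bits and accumulated into 32-bit lanes with pmaddwd.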
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
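+    // Remainder: kc was rounded up to a multiple of 2, so 2, 4, or 6 bytes may remain.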
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
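+    // Requantization: convert the int32 accumulators to float, multiply by the scales stored in the packed weights, and round back to int32.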
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
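+    // SSE4.1 provides signed 8-bit min/max, so the clamp is applied after packing to int8.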
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..965ef36
--- /dev/null
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,256 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 += 8;
+
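+      // LD64 variant: weights are loaded 8 bytes at a time and sign-extended individually, rather than as 16-byte blocks split into two halves.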
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_add_epi32(vacc0x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc1x0123 = _mm_add_epi32(vacc1x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc2x0123 = _mm_add_epi32(vacc2x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+      vacc3x0123 = _mm_add_epi32(vacc3x0123,
+        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..f0b662e
--- /dev/null
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
@@ -0,0 +1,261 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 += 8;
+
+      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
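+      // XOP: _mm_maddd_epi16 fuses the 16-bit multiply-add pairs with the 32-bit accumulation, replacing the separate pmaddwd + paddd used on SSE2/SSE4.1.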
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+      vacc3x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+      vacc3x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+      vacc3x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
+      vacc3x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+      vacc3x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+          vacc2x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+          vacc3x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..ca51f86
--- /dev/null
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
@@ -0,0 +1,261 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t k = kc;
+    while (k >= 8 * sizeof(int8_t)) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 += 8;
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+      vacc3x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123);
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
+      const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+      vacc3x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+      vacc3x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
+      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
+      vacc3x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
+
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
+      k -= 8 * sizeof(int8_t);
+    }
+    if (k != 0) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      a0 = (const int8_t*) ((uintptr_t) a0 + k);
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      a1 = (const int8_t*) ((uintptr_t) a1 + k);
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 = (const int8_t*) ((uintptr_t) a2 + k);
+      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+      const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+      vacc0x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+      vacc1x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+      vacc2x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+      vacc3x0123 = _mm_maddd_epi16(
+        _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123);
+
+      if (k > 2 * sizeof(int8_t)) {
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
+
+        if (k > 4 * sizeof(int8_t)) {
+          const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+          const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+          vacc2x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+          vacc3x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
+        }
+      }
+    }
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..578845e
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
@@ -0,0 +1,158 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
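+      // Indirection buffer: fetch the next activation row pointer; the sentinel zero pointer marks padding rows and is never adjusted by a_offset.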
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
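+    // QC8: the requantization scale is per output channel and is read from the packed weight stream immediately after the weights.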
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..e05d50a
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
@@ -0,0 +1,158 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..5d1bab5
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,159 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
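+        // SSE2 has no pmovsxbw, so inputs are sign-extended by unpacking with a mask of their sign bits (0 > x).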
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..1bac56b
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,159 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..9feb8ff
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,158 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..096cc50
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,158 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..8563c33
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
@@ -0,0 +1,163 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..5f73380
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
@@ -0,0 +1,163 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..11f4972
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
@@ -0,0 +1,130 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..ff80586
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
@@ -0,0 +1,132 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..4e8c01d
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,131 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..7820121
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,133 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..5200238
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,130 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..4376e65
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,132 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
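The LD64 variant above sign-extends 8 weight bytes at a time with _mm_cvtepi8_epi16, while the LD128 variants load 16 bytes and recover the sign bits with an _mm_cmpgt_epi8 / _mm_unpacklo_epi8 pair. A minimal standalone check of that equivalence (assumes SSE4.1 is available for the reference path):

#include <assert.h>
#include <smmintrin.h>
#include <stdint.h>
#include <string.h>

// Sign-extending the low 8 int8 lanes with the compare/unpack idiom used by
// the LD128 kernels matches _mm_cvtepi8_epi16, which the LD64 kernels use on
// 8-byte loads.
int main(void) {
  const int8_t b[16] = {-128, -1, 0, 1, 127, -7, 42, -100,
                        13, -13, 99, -99, 5, -5, 64, -64};
  const __m128i vb = _mm_loadu_si128((const __m128i*) b);

  // LD128 path: one 16-byte load, sign bits recovered by comparing against 0.
  const __m128i vsb = _mm_cmpgt_epi8(_mm_setzero_si128(), vb);
  const __m128i vxb_lo = _mm_unpacklo_epi8(vb, vsb);

  // LD64 path: _mm_cvtepi8_epi16 on the low 8 bytes.
  const __m128i vxb_ref = _mm_cvtepi8_epi16(vb);

  int16_t lo[8], ref[8];
  _mm_storeu_si128((__m128i*) lo, vxb_lo);
  _mm_storeu_si128((__m128i*) ref, vxb_ref);
  assert(memcmp(lo, ref, sizeof(lo)) == 0);
  return 0;
}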
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..3cbf83e
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
@@ -0,0 +1,135 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+        vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+        vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
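The only arithmetic difference between this XOP variant and the SSE4.1 one is that the add-of-madd pair is fused into a single _mm_maddd_epi16(a, b, acc), saving the separate vector add per step of the inner loop. On plain SSE the same accumulator update is the sketch below (the helper name is illustrative):

#include <emmintrin.h>

// The XOP kernels use _mm_maddd_epi16(a, b, acc); on SSE2/SSE4.1 the same
// update is an explicit multiply-add followed by an add, exactly as in the
// non-XOP variants of these kernels.
static inline __m128i madd_accumulate_sse(__m128i acc, __m128i a, __m128i b) {
  return _mm_add_epi32(acc, _mm_madd_epi16(a, b));
}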
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..e028b42
--- /dev/null
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
@@ -0,0 +1,137 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..c24cae1
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
@@ -0,0 +1,195 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
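In the MRx4c2 kernels in this change, weights are packed as groups of 4 output channels by 2 k elements, and each activation pair is broadcast across the register with _mm_shuffle_epi32 before _mm_madd_epi16, so every 32-bit lane accumulates one channel's partial dot product. A scalar model of a single KC=2 step (names and packing comment are illustrative):

#include <stdint.h>

// Scalar model of one KC=2 step of the MRx4c2 kernels: one activation pair is
// multiplied against 4 packed weight columns, which is what the
// _mm_shuffle_epi32 broadcast plus _mm_madd_epi16 do per 32-bit lane.
static void qc8_gemm_4c2_step(
    int32_t acc[4],          // running int32 accumulators for 4 output channels
    const int8_t a_pair[2],  // 2 consecutive activations of one row
    const int8_t w_block[8]) // packed as [n0k0 n0k1 n1k0 n1k1 n2k0 n2k1 n3k0 n3k1]
{
  for (int n = 0; n < 4; n++) {
    acc[n] += (int32_t) a_pair[0] * (int32_t) w_block[2 * n + 0]
            + (int32_t) a_pair[1] * (int32_t) w_block[2 * n + 1];
  }
}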
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..84c1823
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
@@ -0,0 +1,195 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..42c06c0
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,196 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
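The SSE2 variants clamp in the 16-bit domain (_mm_max_epi16/_mm_min_epi16 before _mm_packs_epi16) because SSE2 lacks the packed-byte min/max that the SSE4.1/AVX/XOP variants apply after packing (_mm_max_epi8/_mm_min_epi8). Since _mm_packs_epi16 saturates and the bounds fit in int8, both orderings produce the same bytes; a brute-force scalar check over all int16 inputs (example bounds only):

#include <assert.h>
#include <stdint.h>

// Clamping before or after the saturating int16 -> int8 pack gives the same
// byte when the bounds themselves fit in int8.
static int8_t sat_i8(int v) { return (int8_t) (v > 127 ? 127 : v < -128 ? -128 : v); }

int main(void) {
  const int min = -100, max = 100;  // stand-ins for output_min/output_max
  for (int v = -32768; v <= 32767; v++) {
    // SSE2 order: clamp in 16 bits, then saturating pack.
    const int clamp_first = v < min ? min : v > max ? max : v;
    const int8_t a = sat_i8(clamp_first);
    // SSE4.1 order: saturating pack, then clamp in 8 bits.
    const int8_t packed = sat_i8(v);
    const int8_t b = (int8_t) (packed < min ? min : packed > max ? max : packed);
    assert(a == b);
  }
  return 0;
}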
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..1684f7f
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,196 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
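All of the IGEMM kernels in this change walk the indirection buffer the same way: a supplies ks/sizeof(void*) row pointers per output tile, entries equal to zero point at a shared zero row and are not shifted by a_offset, and a is rewound by ks bytes once a group of 4 output channels has been written. A scalar sketch of that walk for mr = 1 (names and the accumulate callback are illustrative):

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of the indirection walk shared by the IGEMM kernels above:
// pointers equal to 'zero' reference a shared zero row and must not be
// shifted by a_offset.
void igemm_indirection_walk(
    size_t ks,                   // bytes of pointers consumed per output tile
    const int8_t** a,            // indirection buffer (mr = 1 here)
    size_t a_offset,             // byte offset applied to non-zero rows
    const int8_t* zero,          // shared zero row
    size_t kc,                   // bytes of input consumed per row pointer
    void (*accumulate)(const int8_t* row, size_t kc))
{
  size_t p = ks;
  do {
    const int8_t* a0 = a[0];
    if (a0 != zero) {
      a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
    }
    a += 1;
    accumulate(a0, kc);
    p -= sizeof(void*);
  } while (p != 0);
}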
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..1fda902
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,195 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..6d7af73
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,195 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
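+    // Requantize in fp32: scale each int32 accumulator by the per-output-channel
+    // scale packed after the weights, then round back to int32.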
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
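+    // With at least 4 output channels left, store 4 bytes per row and step by cn_stride;
+    // `a` is rewound by ks so the same indirection entries are reused for the next block
+    // of output channels. Smaller tails write 2 and/or 1 bytes.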
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..da7704e
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
@@ -0,0 +1,200 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
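+        // XOP: _mm_maddd_epi16 fuses the 16-bit multiply and pair-wise add (pmaddwd)
+        // with the 32-bit accumulation (paddd) into a single instruction.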
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+            vacc1x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..a8c6337
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
@@ -0,0 +1,200 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+            vacc1x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..0fe37e8
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
@@ -0,0 +1,160 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
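+    // Each of the four accumulators per row holds partial sums for one output channel;
+    // two rounds of _mm_hadd_epi32 reduce them to a single vector of per-channel totals.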
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..b5a2944
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
@@ -0,0 +1,162 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
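+        // LD64 variant: weights are loaded 8 bytes at a time and sign-extended with
+        // pmovsxbw, rather than as a 16-byte load split into low/high halves as in
+        // the LD128 variant.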
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..f16164d
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,161 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = 0;
+      while (k < kc) {
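+        // SSE2 has no pmovsxbw, so the 8-bit inputs and weights are sign-extended by
+        // interleaving each byte with its sign mask (_mm_cmpgt_epi8 against zero).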
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
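+    // SSE2 lacks the SSSE3 phaddd horizontal add, so the four per-channel accumulators
+    // are reduced with unpack/add shuffles instead.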
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
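+    // SSE2 has no packed signed 8-bit min/max, so the clamp is applied to the 16-bit
+    // values before the final pack down to int8.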
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..78e19f9
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,163 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..1b6e3bc
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,160 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..49ce8d2
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,162 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..8c36446
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
@@ -0,0 +1,165 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+        vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+        vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
+        vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+        vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+        vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
+        vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
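
The XOP variants above replace the PMADDWD + PADDD pair with a single fused `_mm_maddd_epi16` (VPMADCSWD), which multiplies adjacent int16 pairs, sums them, and adds the existing accumulator in one instruction. A sketch of the equivalence, assuming GCC/Clang and `-mxop` for the XOP branch; the helper name is illustrative:

#include <immintrin.h>
#if defined(__XOP__)
  #include <x86intrin.h>  /* brings in the XOP intrinsics on GCC/Clang */
#endif

static inline __m128i madd_accumulate(__m128i vacc, __m128i vxa, __m128i vxb) {
#if defined(__XOP__)
  return _mm_maddd_epi16(vxa, vxb, vacc);                /* one VPMADCSWD */
#else
  return _mm_add_epi32(vacc, _mm_madd_epi16(vxa, vxb));  /* PMADDWD + PADDD */
#endif
}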
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..046e703
--- /dev/null
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
@@ -0,0 +1,167 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+        vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+        vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+        vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+        vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
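
The ld64 and ld128 suffixes differ only in how weight bytes are widened to int16: ld64 loads 8 bytes and sign-extends with PMOVSXBW, while ld128 loads 16 bytes once and splits them against a computed sign mask. Both produce identical vxb vectors; a sketch with illustrative helper names:

#include <smmintrin.h>

static inline __m128i widen_lo_ld64(__m128i vb /* low 8 bytes valid */) {
  return _mm_cvtepi8_epi16(vb);                        /* SSE4.1 PMOVSXBW */
}

static inline __m128i widen_lo_ld128(__m128i vb01 /* all 16 bytes valid */) {
  const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);  /* sign mask */
  return _mm_unpacklo_epi8(vb01, vsb01);               /* low 8 bytes -> int16 */
}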
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..4e873a6
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
@@ -0,0 +1,233 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
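
In the MRx4c2 kernels, the packed weight tile interleaves four output columns with two input channels each, so a single PMADDWD of a broadcast activation pair against that tile already yields one int32 per column and no horizontal reduction is needed. A scalar model of one such step, assuming the tile is stored column-by-column with the two channels adjacent (names are illustrative):

#include <stdint.h>

/* One c2 step: advances every output column of one row by two input channels. */
static void qc8_4c2_reference_step(
    int32_t acc[4],         /* per-column accumulators for one row of A */
    const int8_t a[2],      /* two consecutive input channels of that row */
    const int8_t b[4 * 2])  /* packed tile: 4 columns x 2 channels */
{
  for (int n = 0; n < 4; n++) {
    acc[n] += (int32_t) a[0] * (int32_t) b[2 * n + 0]
            + (int32_t) a[1] * (int32_t) b[2 * n + 1];
  }
}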
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..1210e2c
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
@@ -0,0 +1,233 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
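
Because `kc` is rounded up to a multiple of 2, the tail left after the 8-byte main loop is always 2, 4, or 6 bytes, which the nested `k > 2` / `k > 4` checks above cover exactly. The same dispatch, written out as a scalar sketch (illustrative helper):

#include <stddef.h>

static int tail_groups_of_2(size_t k /* remaining bytes: 2, 4, or 6 */) {
  int groups = 1;             /* the vb0 group is always consumed */
  if (k > 2) { groups++; }    /* vb1 group */
  if (k > 4) { groups++; }    /* vb2 group */
  return groups;              /* 1, 2, or 3 PMADDWD groups in the remainder */
}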
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..387196c
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,235 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+        a2 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
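
The SSE2 variants have no PMOVSXBW, PEXTRD, or packed byte min/max, so they widen bytes with a compare-and-unpack, store 32-bit chunks via PSHUFD + MOVD, and clamp in the int16 domain before the saturating byte pack instead of after it. Since the clamp bounds always lie in [-128, 127], the result matches the SSE4.1 ordering; a sketch of that clamp-then-pack step (helper name is illustrative):

#include <emmintrin.h>

static inline __m128i clamp_pack_sse2(__m128i vacc01x0123, __m128i vacc22x0123,
                                      __m128i vmin16, __m128i vmax16) {
  vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, vmin16), vmax16);
  vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, vmin16), vmax16);
  return _mm_packs_epi16(vacc01x0123, vacc22x0123);  /* saturating int16 -> int8 */
}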
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..3da6524
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,235 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+        a2 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
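
The fp32 requantization shared by all of these kernels converts each int32 accumulator to float, multiplies by the per-column scale stored right after the int8 weights, rounds back with CVTPS2DQ semantics, adds the output zero point, and clamps. A per-element scalar sketch, ignoring the intermediate int16 saturation of the vector path (which cannot trigger for in-range outputs); the function name is illustrative:

#include <math.h>
#include <stdint.h>

static int8_t qc8_fp32_requantize(int32_t acc, float scale,
                                  int16_t zero_point, int8_t qmin, int8_t qmax) {
  const float scaled = (float) acc * scale;   /* CVTDQ2PS + MULPS */
  int32_t q = (int32_t) nearbyintf(scaled);   /* CVTPS2DQ: round to nearest even */
  q += zero_point;                            /* PADDSW on the vector path */
  if (q < qmin) { q = qmin; }                 /* output_min clamp */
  if (q > qmax) { q = qmax; }                 /* output_max clamp */
  return (int8_t) q;
}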
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..3e9f7e0
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,233 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..b060bfa
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,233 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..06006b8
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
@@ -0,0 +1,238 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+          vacc2x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+            vacc1x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+            vacc2x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..807bef7
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
@@ -0,0 +1,238 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+          vacc2x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+            vacc1x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+            vacc2x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..f4d21ea
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
@@ -0,0 +1,191 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+        vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+        vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+        vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..8408090
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
@@ -0,0 +1,193 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+        vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..1e7423e
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,193 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+        a2 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+        vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+        vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+        vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+    const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
+    const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+    __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
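
Note on the indirect-GEMM (IGEMM) structure of the kernels in this change: instead of reading a dense A matrix, each kernel walks an indirection buffer `a` of row pointers, applies `a_offset` only to pointers that do not alias the shared `zero` buffer, and rewinds `a` by `ks` bytes once a 4-column block of C is written. Below is a minimal scalar sketch of that traversal; the helper name and the dense weight layout are hypothetical and only for illustration (the real kernels consume a packed weight blob with interleaved bias and scale).

#include <stddef.h>
#include <stdint.h>

// Scalar model of the indirect-GEMM traversal used by the 3-row kernels in
// this diff (hypothetical helper and weight layout). 'a' holds ks_count
// groups of 3 row pointers; entries equal to 'zero' alias a shared zero
// buffer and must not be shifted by a_offset. 'acc' is a row-major 3 x n
// accumulator array that the caller initializes with the bias values.
static void igemm_ref_3xn(
    size_t n, size_t kc, size_t ks_count,
    const int8_t** a, const int8_t* zero, size_t a_offset,
    const int8_t* w,   // hypothetical dense layout: w[(j * ks_count + s) * kc + k]
    int32_t* acc)
{
  for (size_t s = 0; s < ks_count; s++) {
    const int8_t* a_row[3];
    for (size_t m = 0; m < 3; m++) {
      const int8_t* am = a[3 * s + m];
      a_row[m] = (am == zero) ? am : (const int8_t*) ((uintptr_t) am + a_offset);
    }
    for (size_t j = 0; j < n; j++) {
      for (size_t k = 0; k < kc; k++) {
        const int32_t wv = (int32_t) w[(j * ks_count + s) * kc + k];
        for (size_t m = 0; m < 3; m++) {
          acc[m * n + j] += (int32_t) a_row[m][k] * wv;
        }
      }
    }
  }
}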
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..c84c7cd
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,195 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+        a2 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+        vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+    const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
+    const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+    __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
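
Note on the SSE2 widening idiom used above: `_mm_unpacklo_epi8(v, _mm_cmpgt_epi8(_mm_setzero_si128(), v))` interleaves each byte with its sign mask (0xFF for negative lanes, 0x00 otherwise), which produces the same sign-extended 16-bit values as the SSE4.1 `_mm_cvtepi8_epi16` used in the later variants. A small self-checking sketch (compile with SSE4.1 enabled, e.g. -msse4.1, to have both paths available):

#include <assert.h>
#include <smmintrin.h>  // SSE4.1; the SSE2 idiom itself only needs emmintrin.h
#include <stdint.h>
#include <string.h>

int main(void) {
  const int8_t bytes[16] = {-128, -1, 0, 1, 127, -5, 64, -64,
                            3, -3, 9, -9, 100, -100, 55, -55};
  const __m128i v = _mm_loadu_si128((const __m128i*) bytes);

  // SSE2: interleave each byte with its sign mask (0xFF if negative, else 0x00).
  const __m128i vsign = _mm_cmpgt_epi8(_mm_setzero_si128(), v);
  const __m128i lo_sse2 = _mm_unpacklo_epi8(v, vsign);

  // SSE4.1: direct sign extension of the low 8 bytes.
  const __m128i lo_sse41 = _mm_cvtepi8_epi16(v);

  int16_t a[8], b[8];
  _mm_storeu_si128((__m128i*) a, lo_sse2);
  _mm_storeu_si128((__m128i*) b, lo_sse41);
  assert(memcmp(a, b, sizeof(a)) == 0);
  return 0;
}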
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..39f2ca5
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,191 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+        vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+        vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+        vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
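
Note on the accumulator reduction: the 4c8 kernels keep one partial-sum vector per output column, so the epilogue must horizontally add each vector and gather the four sums into one lane-per-column vector. The SSE4.1/XOP variants do this with two levels of `_mm_hadd_epi32`; the SSE2 variant, which lacks hadd, does an equivalent unpack/add transpose. A sketch verifying that the two reductions agree (compile with SSSE3 or higher, e.g. -mssse3):

#include <assert.h>
#include <stdint.h>
#include <tmmintrin.h>  // _mm_hadd_epi32 (SSSE3); the unpack/add path needs only SSE2

// Reduce four partial-sum vectors to one vector whose lane j is the
// horizontal sum of vj, as done at the end of the 4c8 kernels.
static __m128i reduce4_hadd(__m128i v0, __m128i v1, __m128i v2, __m128i v3) {
  return _mm_hadd_epi32(_mm_hadd_epi32(v0, v1), _mm_hadd_epi32(v2, v3));
}

static __m128i reduce4_sse2(__m128i v0, __m128i v1, __m128i v2, __m128i v3) {
  const __m128i v02 = _mm_add_epi32(_mm_unpacklo_epi32(v0, v2), _mm_unpackhi_epi32(v0, v2));
  const __m128i v13 = _mm_add_epi32(_mm_unpacklo_epi32(v1, v3), _mm_unpackhi_epi32(v1, v3));
  return _mm_add_epi32(_mm_unpacklo_epi32(v02, v13), _mm_unpackhi_epi32(v02, v13));
}

int main(void) {
  const __m128i v0 = _mm_setr_epi32(1, 2, 3, 4);          // sums to 10
  const __m128i v1 = _mm_setr_epi32(-1, 5, -2, 8);        // sums to 10
  const __m128i v2 = _mm_setr_epi32(100, 200, 300, 400);  // sums to 1000
  const __m128i v3 = _mm_setr_epi32(7, 0, -7, 1);         // sums to 1
  int32_t a[4], b[4];
  _mm_storeu_si128((__m128i*) a, reduce4_hadd(v0, v1, v2, v3));
  _mm_storeu_si128((__m128i*) b, reduce4_sse2(v0, v1, v2, v3));
  for (int i = 0; i < 4; i++) {
    assert(a[i] == b[i]);
  }
  return 0;
}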
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..6a4b9d4
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,193 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+        vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+        vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+        vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+        vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+        vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+        vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+        vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+        vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
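
Note on the output tail: after the two pack steps, byte 4*m + j of `vout` holds C[m][j] (the 3-row kernels duplicate row 2 into the top lane). The `nc >= 4` path stores one 32-bit lane per row; the `nc & 2` path stores two bytes per row and shifts each 32-bit lane right by 16 so the `nc & 1` path can store the remaining column from the low byte of each lane. A small sketch checking that lane arithmetic with synthetic values (compile with SSE4.1, e.g. -msse4.1):

#include <assert.h>
#include <smmintrin.h>
#include <stdint.h>

int main(void) {
  int8_t tile[16];
  for (int m = 0; m < 4; m++) {
    for (int j = 0; j < 4; j++) {
      tile[4 * m + j] = (int8_t) (10 * m + j);  // C[m][j] := 10*m + j
    }
  }
  __m128i vout = _mm_loadu_si128((const __m128i*) tile);

  // nc >= 4: each row is one 32-bit lane.
  assert((int8_t) _mm_cvtsi128_si32(vout) == 0);       // C[0][0]
  assert((int8_t) _mm_extract_epi32(vout, 2) == 20);   // C[2][0], low byte of lane 2

  // nc & 2: two bytes per row, then shift each lane right by 16 bits.
  assert((int8_t) _mm_extract_epi16(vout, 4) == 20);   // C[2][0..1], low byte = C[2][0]
  vout = _mm_srli_epi32(vout, 16);

  // nc & 1: the remaining (third) column is now the low byte of each lane.
  assert((int8_t) _mm_extract_epi8(vout, 8) == 22);    // C[2][2]
  assert((int8_t) _mm_cvtsi128_si32(vout) == 2);       // C[0][2]
  return 0;
}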
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..4369f6d
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
@@ -0,0 +1,196 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+        vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+        vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
+        vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
+        vacc2x0 = _mm_maddd_epi16(vxa2, vxb0, vacc2x0);
+        vacc2x1 = _mm_maddd_epi16(vxa2, vxb1, vacc2x1);
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+        vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+        vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
+        vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
+        vacc2x2 = _mm_maddd_epi16(vxa2, vxb2, vacc2x2);
+        vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
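
Note on the XOP variant: `_mm_maddd_epi16(a, b, c)` performs the same 16-bit multiply and pairwise 32-bit add as `_mm_madd_epi16` and folds in the accumulator add, so every `_mm_add_epi32(vacc, _mm_madd_epi16(...))` pair from the SSE variants collapses to one instruction. A sketch of that equivalence, using the same include guard as the kernel (GCC/Clang with -mxop, or MSVC with ammintrin.h; it only runs on XOP-capable AMD hardware):

#include <assert.h>
#include <stdint.h>
#if defined(__GNUC__) || defined(__clang__)
  #include <x86intrin.h>
#else
  #include <immintrin.h>
  #include <ammintrin.h>
#endif

int main(void) {
  const __m128i va = _mm_setr_epi16(1, -2, 3, -4, 5, -6, 7, -8);
  const __m128i vb = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
  const __m128i vacc = _mm_setr_epi32(1000, 2000, 3000, 4000);

  // SSE2: widening multiply, pairwise add, then a separate accumulate.
  const __m128i vref = _mm_add_epi32(vacc, _mm_madd_epi16(va, vb));
  // XOP: the same operation as a single fused multiply-add-accumulate.
  const __m128i vxop = _mm_maddd_epi16(va, vb, vacc);

  int32_t r[4], x[4];
  _mm_storeu_si128((__m128i*) r, vref);
  _mm_storeu_si128((__m128i*) x, vxop);
  for (int i = 0; i < 4; i++) {
    assert(r[i] == x[i]);
  }
  return 0;
}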
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..0433e8f
--- /dev/null
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
@@ -0,0 +1,198 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
+        vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
+        vacc2x0 = _mm_maddd_epi16(vxa2, vxb0, vacc2x0);
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
+        vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
+        vacc2x1 = _mm_maddd_epi16(vxa2, vxb1, vacc2x1);
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
+        vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
+        vacc2x2 = _mm_maddd_epi16(vxa2, vxb2, vacc2x2);
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
+        vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
+        vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale0123 = _mm_load_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
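
Note on the QC8 "fp32" requantization epilogue shared by all of these kernels: each int32 accumulator is converted to float, multiplied by a per-output-channel scale stored inline in the packed weights right after the weight bytes, rounded back to int32 by `_mm_cvtps_epi32` (round-to-nearest-even under the default MXCSR mode), then packed with signed saturation, offset by the output zero point, and clamped. A scalar reference of that pipeline (a sketch with a hypothetical helper name; assumes the default rounding mode and that acc * scale stays within int32 range):

#include <math.h>
#include <stdint.h>

// Scalar model of the fp32 requantization epilogue:
// acc -> float -> * per-channel scale -> round-to-nearest-even ->
// saturate -> + zero point -> clamp -> int8.
static int8_t requantize_fp32(int32_t acc, float scale,
                              int16_t output_zero_point,
                              int8_t output_min, int8_t output_max)
{
  // _mm_cvtepi32_ps + _mm_mul_ps: apply the per-channel scale.
  const float scaled = (float) acc * scale;
  // _mm_cvtps_epi32 rounds to nearest-even; lrintf does the same by default.
  long v = lrintf(scaled);
  // _mm_packs_epi32 saturates to int16 ...
  if (v > 32767) v = 32767;
  if (v < -32768) v = -32768;
  // ... _mm_adds_epi16 adds the output zero point with saturation ...
  v += output_zero_point;
  if (v > 32767) v = 32767;
  if (v < -32768) v = -32768;
  // ... and the clamp plus _mm_packs_epi16 keep the final int8 in [min, max].
  if (v < output_min) v = output_min;
  if (v > output_max) v = output_max;
  return (int8_t) v;
}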
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
new file mode 100644
index 0000000..9118bd1
--- /dev/null
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
@@ -0,0 +1,270 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc3x0123 = _mm_add_epi32(vacc3x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
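
Note on the 4x4c2 layout: these kernels consume K in pairs. Each row's activations are widened to 16 bits and `_mm_shuffle_epi32(vxa, _MM_SHUFFLE(i, i, i, i))` broadcasts the i-th pair to all four 32-bit lanes, so a single `_mm_madd_epi16` against a widened 2x4 weight block yields the four column dot-products for that pair; `kc` is rounded up to 2 and the `k > 2` / `k > 4` ladder covers remainders of 2, 4, or 6 bytes. Below is a scalar model of one unrolled 8-byte step; the helper name is hypothetical, and the weight indexing reflects the pairing implied by the madd usage above rather than a documented packing format.

#include <stdint.h>

// Scalar model of one k-group in the 4c2 inner loop: for each of the four
// 2-element activation pairs (broadcast via _mm_shuffle_epi32 in the kernel),
// accumulate the dot product with the matching 2 x 4-column weight block.
static void gemm_4c2_step_ref(
    const int8_t a_row[8],     // 8 activations of one row (one unrolled k-step)
    const int8_t w_block[32],  // weights packed as 4 groups of [2 k-values x 4 columns]
    int32_t acc[4])            // 4 column accumulators for this row (bias-initialized)
{
  for (int g = 0; g < 4; g++) {          // pair index: the _MM_SHUFFLE(g, g, g, g) broadcast
    for (int j = 0; j < 4; j++) {        // output column
      for (int kk = 0; kk < 2; kk++) {   // the two k values combined by _mm_madd_epi16
        acc[j] += (int32_t) a_row[2 * g + kk] * (int32_t) w_block[8 * g + 2 * j + kk];
      }
    }
  }
}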
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
new file mode 100644
index 0000000..affc814
--- /dev/null
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
@@ -0,0 +1,270 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc3x0123 = _mm_add_epi32(vacc3x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
new file mode 100644
index 0000000..cb05a57
--- /dev/null
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
@@ -0,0 +1,272 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+        a2 += 8;
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_unpacklo_epi8(va3, _mm_cmpgt_epi8(_mm_setzero_si128(), va3));
+        a3 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_unpacklo_epi8(va3, _mm_cmpgt_epi8(_mm_setzero_si128(), va3));
+        a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc3x0123 = _mm_add_epi32(vacc3x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(3, 3, 3, 3)));
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
new file mode 100644
index 0000000..95bfccf
--- /dev/null
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
@@ -0,0 +1,272 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+        a2 += 8;
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_unpacklo_epi8(va3, _mm_cmpgt_epi8(_mm_setzero_si128(), va3));
+        a3 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, _mm_cmpgt_epi8(_mm_setzero_si128(), va0));
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, _mm_cmpgt_epi8(_mm_setzero_si128(), va1));
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, _mm_cmpgt_epi8(_mm_setzero_si128(), va2));
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_unpacklo_epi8(va3, _mm_cmpgt_epi8(_mm_setzero_si128(), va3));
+        a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc3x0123 = _mm_add_epi32(vacc3x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(3, 3, 3, 3)));
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
new file mode 100644
index 0000000..0035013
--- /dev/null
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
@@ -0,0 +1,270 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc3x0123 = _mm_add_epi32(vacc3x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
new file mode 100644
index 0000000..c4f0590
--- /dev/null
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
@@ -0,0 +1,270 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc3x0123 = _mm_add_epi32(vacc3x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
new file mode 100644
index 0000000..dcd4f0a
--- /dev/null
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
@@ -0,0 +1,275 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 += 8;
+
+        const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
+        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
+        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
+        const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
+        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
+        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+          vacc2x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+          vacc3x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+            vacc1x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+            vacc2x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+            vacc3x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
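
The XOP kernel above relies on `_mm_maddd_epi16`, which fuses the multiply of adjacent int16 pairs, the pairwise add into int32 lanes, and the accumulator add into a single instruction. On plain SSE2 the same update takes two intrinsics; an illustrative sketch of the equivalence:

```c
#include <emmintrin.h>  // SSE2

// Equivalent of the XOP update
//   vacc = _mm_maddd_epi16(va, vb, vacc);
// using only SSE2 intrinsics: multiply adjacent int16 pairs, horizontally
// add each pair into an int32 lane, then add into the accumulator.
static inline __m128i madd_accumulate_sse2(__m128i va, __m128i vb, __m128i vacc) {
  return _mm_add_epi32(vacc, _mm_madd_epi16(va, vb));
}
```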
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
new file mode 100644
index 0000000..5b38754
--- /dev/null
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
@@ -0,0 +1,275 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx4c2-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+  #include <x86intrin.h>
+#else
+  #include <immintrin.h>
+  #include <ammintrin.h>
+#endif
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 2);
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 += 8;
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 += 8;
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123);
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+        const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+        const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
+
+        w = (const void*) ((uintptr_t) w + 32);
+        k -= 8 * sizeof(int8_t);
+      }
+      if (k != 0) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+        a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+        a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+        a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+        a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((uintptr_t) w + 8);
+        const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
+
+        vacc0x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
+        vacc1x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc1x0123);
+        vacc2x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
+        vacc3x0123 = _mm_maddd_epi16(
+          _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((uintptr_t) w + 8);
+          const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
+
+          vacc0x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
+          vacc1x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
+          vacc2x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
+          vacc3x0123 = _mm_maddd_epi16(
+            _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((uintptr_t) w + 8);
+            const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
+
+            vacc0x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
+            vacc1x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc1x0123);
+            vacc2x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
+            vacc3x0123 = _mm_maddd_epi16(
+              _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
+
+    const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+    w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale0123);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->sse4.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
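
The `ld64` variant just shown widens each 8-byte weight group with `_mm_cvtepi8_epi16`, while the `ld128` variant earlier loads 16 bytes at once and widens both halves by unpacking against a computed sign mask. A small sketch contrasting the two widening idioms (illustrative only; SSE4.1 assumed for `_mm_cvtepi8_epi16`):

```c
#include <smmintrin.h>  // SSE4.1

// ld64-style widening: 8 int8 weights -> 8 int16 lanes.
static inline __m128i widen_ld64(const void* w) {
  const __m128i vb = _mm_loadl_epi64((const __m128i*) w);
  return _mm_cvtepi8_epi16(vb);
}

// ld128-style widening: 16 int8 weights -> two vectors of 8 int16 lanes,
// sign-extended by unpacking the bytes against their own sign mask.
static inline void widen_ld128(const void* w, __m128i* lo, __m128i* hi) {
  const __m128i vb = _mm_loadu_si128((const __m128i*) w);
  const __m128i vsb = _mm_cmpgt_epi8(_mm_setzero_si128(), vb);  // 0xFF where vb < 0
  *lo = _mm_unpacklo_epi8(vb, vsb);
  *hi = _mm_unpackhi_epi8(vb, vsb);
}
```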
diff --git a/src/qs8-gemm/MRx4c2-sse.c.in b/src/qs8-gemm/MRx4c2-sse.c.in
index 27a0f0f..2d9ea8a 100644
--- a/src/qs8-gemm/MRx4c2-sse.c.in
+++ b/src/qs8-gemm/MRx4c2-sse.c.in
@@ -7,6 +7,7 @@
 $assert not XOP or AVX
 $assert not AVX or SSE == 4
 $assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
 $assert SSE != 3 or REQUANTIZATION != "FP32"
 $assert VARIANT in ["LD64", "LD128", "EXTENDED"]
 $assert MR <= 4
@@ -27,11 +28,14 @@
 #include <xnnpack/math.h>
 
 
+
+$DATATYPE = "qc8" if CHANNELWISE else "qs8"
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
-$PARAMS_STRUCT = REQUANTIZATION.lower() + ("_sse4" if SSE >= 4 else "_sse2")
+$GEMM_PARAMS = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$PARAMS_STRUCT = ("" if CHANNELWISE else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 else "sse2")
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
-void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c2__${ISA}${LOAD_SUFFIX}(
+void xnn_${DATATYPE}_gemm${GEMM_SUFFIX}_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c2__${ISA}${LOAD_SUFFIX}(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -41,7 +45,7 @@
     int8_t* restrict c,
     size_t cm_stride,
     size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+    const union ${GEMM_PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
 {
   assert(mr != 0);
   assert(mr <= ${MR});
@@ -298,9 +302,15 @@
       $for M in range(MR):
         __m128 vscaled${M}x0123 = _mm_cvtepi32_ps(vacc${M}x0123);
 
-      const __m128 vscale = _mm_load_ps(params->fp32_${"sse4" if SSE >= 4 else "sse2"}.scale);
-      $for M in range(MR):
-        vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale);
+      $if CHANNELWISE:
+        const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+        w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+        $for M in range(MR):
+          vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale0123);
+      $else:
+        const __m128 vscale = _mm_load_ps(params->${PARAMS_STRUCT}.scale);
+        $for M in range(MR):
+          vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale);
 
       $for M in range(MR):
         vacc${M}x0123 = _mm_cvtps_epi32(vscaled${M}x0123);
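
The template change above switches the FP32 requantization between a single per-tensor scale taken from `params` (qs8) and per-output-channel scales read from the packed weight stream (qc8), with `w` advanced past the four floats once they are consumed. A scalar sketch of the channelwise path for one output value (hypothetical helper; assumes the scale for a channel directly follows that channel group's weights, as the generated kernels above do):

```c
#include <math.h>
#include <stdint.h>

// Scalar model of FP32 channelwise requantization for one output element.
// acc is the int32 accumulator, scale the per-channel scale from the packed
// weights; zero_point/min/max correspond to the minmax params.
static inline int8_t requantize_fp32(int32_t acc, float scale,
                                     int16_t zero_point, int8_t min, int8_t max) {
  float v = (float) acc * scale;     // apply the per-channel scale in float
  long r = lrintf(v) + zero_point;   // round to nearest even, add zero point
  if (r < min) r = min;              // clamp to the output range
  if (r > max) r = max;
  return (int8_t) r;
}
```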
diff --git a/src/qs8-gemm/MRx4c8-sse.c.in b/src/qs8-gemm/MRx4c8-sse.c.in
index 7c505d9..d50aff8 100644
--- a/src/qs8-gemm/MRx4c8-sse.c.in
+++ b/src/qs8-gemm/MRx4c8-sse.c.in
@@ -7,6 +7,7 @@
 $assert not XOP or AVX
 $assert not AVX or SSE == 4
 $assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
 $assert VARIANT in ["LD64", "LD128", "EXTENDED"]
 $assert MR <= 4
 #include <assert.h>
@@ -26,11 +27,13 @@
 #include <xnnpack/math.h>
 
 
+$DATATYPE = "qc8" if CHANNELWISE else "qs8"
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
-$PARAMS_STRUCT = REQUANTIZATION.lower() + ("_sse4" if SSE >= 4 else "_sse2")
+$GEMM_PARAMS = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$PARAMS_STRUCT = ("" if CHANNELWISE else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 else "sse2")
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
-void xnn_qs8_gemm${GEMM_SUFFIX}_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c8__${ISA}${LOAD_SUFFIX}(
+void xnn_${DATATYPE}_gemm${GEMM_SUFFIX}_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c8__${ISA}${LOAD_SUFFIX}(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -40,7 +43,7 @@
     int8_t* restrict c,
     size_t cm_stride,
     size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+    const union ${GEMM_PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
 {
   assert(mr != 0);
   assert(mr <= ${MR});
@@ -234,9 +237,15 @@
       $for M in range(MR):
         __m128 vscaled${M}x0123 = _mm_cvtepi32_ps(vacc${M}x0123);
 
-      const __m128 vscale = _mm_load_ps(params->fp32_${"sse4" if SSE >= 4 else "sse2"}.scale);
-      $for M in range(MR):
-        vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale);
+      $if CHANNELWISE:
+        const __m128 vscale0123 = _mm_load_ps((const float*) w);
+        w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+        $for M in range(MR):
+          vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale0123);
+      $else:
+        const __m128 vscale = _mm_load_ps(params->${PARAMS_STRUCT}.scale);
+        $for M in range(MR):
+          vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale);
 
       $for M in range(MR):
         vacc${M}x0123 = _mm_cvtps_epi32(vscaled${M}x0123);
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
index 841fdc5..6f94966 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
index ecc7bf0..d170ad2 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
index 202d416..4b6d42a 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
index 3e8d08d..8d2f47b 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
index 5ee38c5..5f6cae8 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
index 4a61f66..199412c 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
index c971ef9..4715cf7 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
index cac9d9e..881d62d 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
index 427cb9f..8836720 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__avx_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
index b1c30ad..2129bc1 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__avx_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
index 1836237..0abda50 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
index 4d16fd0..4d7ce74 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
index e97a400..b4a9b0c 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
index 278ad8f..ffb00ef 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
index f9cb7c7..38464ee 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
index 6e7a449..fc75b15 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
index 21adf15..8c30df8 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__xop_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
index 0b07fc7..6d69274 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__xop_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
index 3fdd41b..cccabb5 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
index 37bf918..f2d23c2 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
index ca397ec..ab4b779 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
index 946b93d..b56e936 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
index 543f507..f756875 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
index c03b700..2516513 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
index 28c0f03..7a09d5a 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
index 31f042a..7fbfbad 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
index cc1da18..0f691a5 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
index dcbe48c..c7f020e 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
index 5b42ad3..e88bd8f 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
index bb8681d..dfe624b 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
index a7b8771..afd0f7d 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
index de0580f..27938c0 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
index 4213ba5..fb564ff 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__avx_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
index 3314380..1c2fc39 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
index 3e0a980..6e2366c 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
index e59f622..417b08a 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
index 5bbf9c3..0171f45 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
index a1eb413..37bb94c 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
index d917ca9..f4a4f3a 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
index 42fec23..84465b6 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
index f225098..7ebabff 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__xop_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
index 1f998e1..df31557 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
index ea6a894..b834777 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
index ff5840a..0aa0edd 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
index 895b0cf..56958e0 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__ssse3(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
index 4bc8c55..372a2b8 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
index 7bc2350..6081a4f 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
index 8d3eaa3..675b191 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
index bf5d673..e3bdff8 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
index 2040f86..766bcc4 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
index 77be744..88bd56d 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
index 8ccc871..2d1bad6 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
index 8d862f3..5b4d68d 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
index 60cce10..77fcc41 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
index acf1eb3..b185bd7 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
index 1d059b7..db7f469 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__avx_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
index 8cc7035..797d5f5 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
index a01a80a..39fa2da 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
index a6a16fa..a6d3751 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
index c60f805..976826e 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
index b693059..c6adbd4 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
index 7193059..bf5faf3 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
index 525d542..1941a8d 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
index 9fb43b5..378abc7 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x4c2__xop_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
index b58ef12..48bea9e 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__avx(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
index e43eb3e..8af9bb3 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse2(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
index 22dd63a..7c1fc2f 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
index cb0a4f4..4f606f2 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__ssse3(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
index b0554bc..7d5ddb7 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__xop(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
index cd5dce3..ea8c57a 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
index 7d8c3e2..34cd524 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
index 29dee36..f399baf 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
index 2bb7875..8c8622b 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
index 539b395..a913429 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
index edc063a..842c4b8 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
index 9a82755..ca5000e 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
index 3ae703b..6d6f815 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
index 6ee3c62..33d3d40 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
index 939d732..fcf7b1e 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__avx_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
index df5ede3..13060e4 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
index 9882976..9077840 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
index a7ea9d9..1fd19ad 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
index 64fb13f..bc5046e 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
index 739e36b..a7d464f 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
index fb97abd..1e11bfc 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
index 4a2cf7b..1b6b499 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld128(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
index 0af6be9..589bb37 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__xop_ld64(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
index de0f304..fc5d996 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
index ed56800..f70d278 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
index 29df1a0..de4fe5a 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
index c52fd01..1a1877a 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -15,6 +15,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
index b4a5242..3d76b26 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
@@ -20,6 +20,7 @@
 #include <xnnpack/math.h>
 
 
+
 void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop(
     size_t mr,
     size_t nc,
diff --git a/src/qs8-igemm/MRx4c2-sse.c.in b/src/qs8-igemm/MRx4c2-sse.c.in
index 59eb3bd..84b2b8c 100644
--- a/src/qs8-igemm/MRx4c2-sse.c.in
+++ b/src/qs8-igemm/MRx4c2-sse.c.in
@@ -7,6 +7,7 @@
 $assert not XOP or AVX
 $assert not AVX or SSE == 4
 $assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
 $assert SSE != 3 or REQUANTIZATION != "FP32"
 $assert VARIANT in ["LD64", "LD128"]
 $assert MR <= 4
@@ -27,9 +28,11 @@
 #include <xnnpack/math.h>
 
 
-$PARAMS_STRUCT = REQUANTIZATION.lower() + ("_sse4" if SSE >= 4 else "_sse2")
+$DATATYPE = "qc8" if CHANNELWISE else "qs8"
+$GEMM_PARAMS = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$PARAMS_STRUCT = ("" if CHANNELWISE else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 else "sse2")
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
-void xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c2__${ISA}_${VARIANT.lower()}(
+void xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c2__${ISA}_${VARIANT.lower()}(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -41,7 +44,7 @@
     size_t cn_stride,
     size_t a_offset,
     const int8_t* zero,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+    const union ${GEMM_PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
 {
   assert(mr != 0);
   assert(mr <= ${MR});
@@ -276,9 +279,15 @@
       $for M in range(MR):
         __m128 vscaled${M}x0123 = _mm_cvtepi32_ps(vacc${M}x0123);
 
-      const __m128 vscale = _mm_load_ps(params->fp32_${"sse4" if SSE >= 4 else "sse2"}.scale);
-      $for M in range(MR):
-        vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale);
+      $if CHANNELWISE:
+        const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
+        w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+        $for M in range(MR):
+          vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale0123);
+      $else:
+        const __m128 vscale = _mm_load_ps(params->${PARAMS_STRUCT}.scale);
+        $for M in range(MR):
+          vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale);
 
       $for M in range(MR):
         vacc${M}x0123 = _mm_cvtps_epi32(vscaled${M}x0123);
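
Note on the hunk above: the new CHANNELWISE branch is the substance of this change. In the per-tensor (QS8) path the scale comes from the params struct; in the per-channel (QC8) path the four scales for the current 4-column tile are read directly from the packed weights pointer w, which is then advanced past them. A minimal sketch of the two variants follows, assuming the per-tensor scale is replicated across 4 lanes in the params struct (as the aligned load suggests); the helper names are hypothetical, not XNNPACK functions.

#include <stdint.h>
#include <emmintrin.h>  // SSE2 intrinsics

// Per-tensor (QS8) requantization step: one scale, loaded from params.
// Assumes the scale is replicated into a 16-byte-aligned float[4] in params.
static __m128 requantize_per_tensor(__m128i vacc, const float* params_scale) {
  const __m128 vscale = _mm_load_ps(params_scale);
  return _mm_mul_ps(_mm_cvtepi32_ps(vacc), vscale);
}

// Per-channel (QC8) requantization step: 4 scales read inline from the
// packed weights, with the weight pointer advanced past them afterwards.
static __m128 requantize_per_channel(__m128i vacc, const void** w) {
  const __m128 vscale0123 = _mm_loadu_ps((const float*) *w);
  *w = (const void*) ((uintptr_t) *w + 4 * sizeof(float));
  return _mm_mul_ps(_mm_cvtepi32_ps(vacc), vscale0123);
}
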
diff --git a/src/qs8-igemm/MRx4c8-sse.c.in b/src/qs8-igemm/MRx4c8-sse.c.in
index 86ce128..9b9e868 100644
--- a/src/qs8-igemm/MRx4c8-sse.c.in
+++ b/src/qs8-igemm/MRx4c8-sse.c.in
@@ -7,6 +7,7 @@
 $assert not XOP or AVX
 $assert not AVX or SSE == 4
 $assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
 $assert VARIANT in ["LD64", "LD128"]
 $assert MR <= 4
 #include <assert.h>
@@ -26,9 +27,11 @@
 #include <xnnpack/math.h>
 
 
-$PARAMS_STRUCT = REQUANTIZATION.lower() + ("_sse4" if SSE >= 4 else "_sse2")
+$DATATYPE = "qc8" if CHANNELWISE else "qs8"
+$GEMM_PARAMS = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$PARAMS_STRUCT = ("" if CHANNELWISE else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 else "sse2")
 $ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
-void xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c8__${ISA}_${VARIANT.lower()}(
+void xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c8__${ISA}_${VARIANT.lower()}(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -40,7 +43,7 @@
     size_t cn_stride,
     size_t a_offset,
     const int8_t* zero,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+    const union ${GEMM_PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
 {
   assert(mr != 0);
   assert(mr <= ${MR});
@@ -233,9 +236,15 @@
       $for M in range(MR):
         __m128 vscaled${M}x0123 = _mm_cvtepi32_ps(vacc${M}x0123);
 
-      const __m128 vscale = _mm_load_ps(params->fp32_${"sse4" if SSE >= 4 else "sse2"}.scale);
-      $for M in range(MR):
-        vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale);
+      $if CHANNELWISE:
+        const __m128 vscale0123 = _mm_load_ps((const float*) w);
+        w = (const void*) ((uintptr_t) w + 4 * sizeof(float));
+        $for M in range(MR):
+          vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale0123);
+      $else:
+        const __m128 vscale = _mm_load_ps(params->${PARAMS_STRUCT}.scale);
+        $for M in range(MR):
+          vscaled${M}x0123 = _mm_mul_ps(vscaled${M}x0123, vscale);
 
       $for M in range(MR):
         vacc${M}x0123 = _mm_cvtps_epi32(vscaled${M}x0123);
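
The MRx4c8 template gets the same CHANNELWISE branch; the only visible difference from the c2 variant is that it loads the per-channel scales with an aligned _mm_load_ps rather than _mm_loadu_ps. Both branches imply a packed-weight layout in which the 4 float scales for a column tile sit immediately after that tile's weights. The sketch below is a hypothetical illustration of that layout, written only to show what the kernel-side reads consume; it is not XNNPACK's actual packing routine, and pack_qc8_tile is an invented name.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Hypothetical packing of one 4-column tile: int32 bias, int8 weights,
// then the 4 per-channel float scales that the CHANNELWISE branch reads
// and skips. Sketch only, under the layout assumption stated above.
static void pack_qc8_tile(void** out,
                          const int32_t bias[4],
                          const int8_t* weights, size_t weights_bytes,
                          const float scale[4]) {
  uint8_t* p = (uint8_t*) *out;
  memcpy(p, bias, 4 * sizeof(int32_t));  p += 4 * sizeof(int32_t);
  memcpy(p, weights, weights_bytes);     p += weights_bytes;
  memcpy(p, scale, 4 * sizeof(float));   p += 4 * sizeof(float);
  *out = (void*) p;
}
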
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index adc9a5c..ce4f81a 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -919,6 +919,78 @@
       size_t cn_stride,                                   \
       const union xnn_qs8_minmax_params* params);
 
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128)
+
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128)
+
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2)
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__avx2)
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 9608f1a..d24bda7 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -667,6 +667,78 @@
       const int8_t* zero,                                  \
       const union xnn_qs8_minmax_params* params);
 
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128)
+
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128)
+
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2)
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__avx2)
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2)
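
Each DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION line above declares a full prototype for one of the new kernels, taking xnn_qs8_minmax_params instead of the xnn_qs8_conv_minmax_params used by the QS8 kernels. The sketch below reconstructs one such prototype from the MRx4c2-sse.c.in template and the macro tail visible in this diff; the middle arguments (ks, a, w, c, cm_stride) follow XNNPACK's usual IGEMM argument order and are written from that convention rather than copied from this change, and attribute macros such as XNN_DISABLE_TSAN are omitted.

void xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** a,
    const void* w,
    int8_t* c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_minmax_params* params);
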
diff --git a/test/qc8-gemm-minmax-fp32.cc b/test/qc8-gemm-minmax-fp32.cc
index 7f20675..27d8db7 100644
--- a/test/qc8-gemm-minmax-fp32.cc
+++ b/test/qc8-gemm-minmax-fp32.cc
@@ -23,6 +23,25542 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
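+// Note (hand-written comment, not generator output): in the generated cases below,
+// each GemmMicrokernelTester invocation presumably fixes the microkernel tile shape
+// (mr/nr/kr/sr), sets the actual problem size under test (m/n/k), and optionally
+// applies non-default strides (a_stride, cn_stride, cm_stride) or clamping bounds
+// (qmin, qmax) to exercise the kernel's edge handling; iterations(1) appears to
+// restrict the subtile sweeps to a single randomized run each.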
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
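+// Note (hand-written comment, not generator output): to run one of these generated
+// suites in isolation, the standard GoogleTest filter flag should work against the
+// QC8 GEMM test binary (binary name assumed here), e.g.:
+//   ./qc8_gemm_minmax_fp32_test --gtest_filter='QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128.*'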
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
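+// Note (descriptive comment, not part of the generated kernel logic): the
+// cases above follow the same taxonomy as the other QC8 GEMM tests in this
+// file. mr/nr/kr/sr describe the microkernel tile, m/n/k the test problem
+// size; k_eq_8 / k_lt_8 / k_gt_8 / k_div_8 probe the K remainder handling,
+// n_gt_4 / n_div_4 the N tile, and the subtile variants sweep partial tiles
+// with a single iteration each. a_stride, cn_stride and cm_stride exercise
+// non-contiguous input and output layouts, while qmin/qmax exercise output
+// clamping. Test() runs the microkernel and compares its output against a
+// reference computation that uses the supplied fp32 requantization routine.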
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
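+
+// Editorial note on the generated cases above (annotation, not generator output):
+// the MRxNRcKR suffix in each ukernel name fixes the tester tile, e.g.
+// xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128 is exercised with
+// .mr(2).nr(4).kr(8).sr(1). The k_eq_8 / k_lt_8 / k_gt_8 / k_div_8 cases walk the
+// k-block boundary (the k-block appears to be 8 for these kernels), n_gt_4 and
+// n_div_4 probe the nr edge, the *_subtile cases shrink m and n below the tile,
+// the strided_a / strided_cn / strided_cm cases use non-contiguous strides, and
+// qmin / qmax exercise the minmax clamp.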
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
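+
+// Editorial sketch (annotation only, not part of the generated file): the
+// xnn_qs8_requantize_fp32 reference passed to .Test() is assumed here to perform
+// conventional FP32 requantization of the int32 accumulator -- scale, round to
+// nearest, add the output zero point, then clamp to the qmin/qmax range the tests
+// set. A minimal stand-alone illustration, with hypothetical names:
+//
+//   #include <algorithm>
+//   #include <cmath>
+//   #include <cstdint>
+//
+//   // Hypothetical helper mirroring what the tests expect of the kernel output.
+//   inline int8_t requantize_fp32_sketch(int32_t acc, float scale,
+//                                        int32_t zero_point, int8_t qmin, int8_t qmax) {
+//     const int32_t q =
+//         static_cast<int32_t>(std::lrintf(static_cast<float>(acc) * scale)) + zero_point;
+//     return static_cast<int8_t>(std::min<int32_t>(std::max<int32_t>(q, qmin), qmax));
+//   }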
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
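+
+// Editorial note (annotation): the stride constants in the cases above appear to
+// be chosen simply to exceed the largest contiguous extent in each case --
+// a_stride(11)/(19)/(43)/(83) exceed the respective maximum k of 8/15/40/80, and
+// cn_stride(7)/cm_stride(7) exceed nr = 4 -- so the tester lays the A rows and C
+// columns out non-contiguously.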
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(QC8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8) {
     TEST_REQUIRES_X86_AVX2;
     GemmMicrokernelTester()
diff --git a/test/qc8-gemm-minmax-fp32.yaml b/test/qc8-gemm-minmax-fp32.yaml
index 98ffc1b..03a526d 100644
--- a/test/qc8-gemm-minmax-fp32.yaml
+++ b/test/qc8-gemm-minmax-fp32.yaml
@@ -3,6 +3,174 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
 - name: xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2
   init: xnn_init_qs8_minmax_avx2_params
   k-block: 8
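
Note on the spec entries above: each YAML record simply names a microkernel, the params initializer it needs, and the k-block its tests are built around; the tile shape is encoded in the kernel name itself (e.g. `3x4c8` gives MR=3, NR=4, KR=8, which is why the matching tests set `.mr(3).nr(4).kr(8)`), and `k-block: 8` is what produces the `k_eq_8`, `k_lt_8`, `k_gt_8`, and `k_div_8` cases seen in the generated .cc files. A minimal annotated entry, assuming `name`/`init`/`k-block` are the only keys consumed here (the comments are illustrative, not part of the spec):

```yaml
# One entry from test/qc8-gemm-minmax-fp32.yaml, annotated
- name: xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128  # 3x4 tile (MR=3, NR=4), KR=8 ("c8"), XOP ISA, 128-bit loads
  init: xnn_init_qs8_minmax_sse4_params                    # params initializer handed to GemmMicrokernelTester::Test
  k-block: 8                                               # drives the k ranges: k_eq_8, k_lt_8, k_gt_8, k_div_8
```

The extra `xnn_init_qs8_requantization_fp32_params`/`xnn_qs8_requantize_fp32` pair in every `.Test(...)` call is shared across all of the new QC8 kernels and presumably supplies the scalar fp32 requantization reference the tester checks results against; the IGEMM variants below additionally exercise the indirection-specific knobs (`.ks(...)`, `.a_offset(...)`, `.zero_index(...)`) on top of the same GEMM test grid.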
diff --git a/test/qc8-igemm-minmax-fp32.cc b/test/qc8-igemm-minmax-fp32.cc
index 150947e..da83fde 100644
--- a/test/qc8-igemm-minmax-fp32.cc
+++ b/test/qc8-igemm-minmax-fp32.cc
@@ -23,6 +23,26214 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
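+  // The a_offset and zero tests below exercise IGEMM's indirection handling:
+  // a_offset shifts every real row pointer by a fixed offset, while zero_index
+  // redirects one of the mr row pointers to the shared zero buffer, which must
+  // not receive that offset.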
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
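+  // Test coverage for the 1x4c2 XOP LD64 QC8 IGEMM microkernel: full-tile and
+  // subtile shapes, k below/at/above/divisible-by the 8-element block, strided
+  // cn/cm output, small-kernel (ks) accumulation, a_offset/zero handling, and
+  // qmin/qmax clamping.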
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
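+  // Same coverage matrix as the 1x4c2 XOP block above, for the 2x4 (MR=2) tile.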
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
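+  // Same coverage matrix as the 1x4c2 XOP block above, for the 3x4 (MR=3) tile.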
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
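+  // Same coverage matrix as the 1x4c2 XOP block above, for the 4x4 (MR=4) tile.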
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
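+  // xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128 (MR=1, NR=4, KR=2): the tests below cover k at, below,
+  // above, and divisible by the block size of 8, m/n subtiles, strided cn/cm outputs, small kernels (ks=3),
+  // a_offset/zero-pointer handling, and qmin/qmax clamping.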
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
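+  // xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128 (MR=2, NR=4, KR=2): same coverage as above for the
+  // 2-row variant: k around the block size of 8, m/n subtiles, strided cn/cm, small kernels (ks=3),
+  // a_offset/zero-pointer handling, and qmin/qmax clamping.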
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
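+  // xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128 (MR=3, NR=4, KR=2): same coverage as above for the
+  // 3-row variant: k around the block size of 8, m/n subtiles, strided cn/cm, small kernels (ks=3),
+  // a_offset/zero-pointer handling, and qmin/qmax clamping.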
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
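+  // xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128 (MR=4, NR=4, KR=2): same coverage as above for the
+  // full-tile 4-row variant: k around the block size of 8, m/n subtiles, strided cn/cm, small kernels (ks=3),
+  // a_offset/zero-pointer handling, and qmin/qmax clamping.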
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C2__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C2__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_4X4C2__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD64, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
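+// Each battery below follows the same generated layout: mr/nr/kr/sr describe
+// the microkernel tile, m/n/k set the problem size, and the k_* / n_* cases
+// probe remainder handling around the 8-element K block and the 4-column N
+// tile. The small_kernel, a_offset and zero cases exercise the IGEMM
+// indirection path (roughly: ks pointer entries per output pixel, with
+// zero_index redirecting one row to the zero buffer), while qmin/qmax and the
+// strided_cm/strided_cn cases cover output clamping and strided stores. This
+// summary comment is descriptive only; the exact semantics are those of
+// GemmMicrokernelTester.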
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD64, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
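+// The XOP groups below are gated at runtime: TEST_REQUIRES_X86_XOP skips the
+// test on CPUs without XOP support, mirroring TEST_REQUIRES_X86_AVX in the
+// groups above. The XOP kernels are driven with the same SSE4 minmax
+// parameter initializer (xnn_init_qs8_minmax_sse4_params) as the AVX
+// variants; only the kernel entry point passed to Test() changes.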
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD64, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
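+// The SSE2 LD128 suites below follow the same per-kernel test taxonomy as the XOP LD64
+// suite above: k_eq_8/k_lt_8/k_gt_8/k_div_8 sweep the reduction dimension around the
+// 8-element (c8) unroll, the *_subtile cases cover every m <= MR and n <= NR edge tile
+// with a single iteration, small_kernel sets a kernel size of 3 (.ks(3)) for the
+// indirect-GEMM path, a_offset/zero exercise the input offset and the zero-pointer
+// index, and qmin/qmax/strided_cn/strided_cm check output clamping and strided stores.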
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, zero) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, xnn_init_qs8_minmax_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
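+// The SSE4.1 LD128 variants below mirror the SSE2 suites above in structure, but are
+// gated on TEST_REQUIRES_X86_SSE41 and pass xnn_init_qs8_minmax_sse4_params, the
+// parameter initializer used by the sse41 microkernels.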
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, a_offset) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, zero) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmin) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmax) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, a_offset) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, zero) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_gt_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, n_div_4_small_kernel) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, a_offset) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, zero) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(4)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmin) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, qmax) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_3X4C8__XOP_LD128, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128, xnn_init_qs8_minmax_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(QC8_IGEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8) {
     TEST_REQUIRES_X86_AVX2;
     GemmMicrokernelTester()
diff --git a/test/qc8-igemm-minmax-fp32.yaml b/test/qc8-igemm-minmax-fp32.yaml
index 8624c29..35ce783 100644
--- a/test/qc8-igemm-minmax-fp32.yaml
+++ b/test/qc8-igemm-minmax-fp32.yaml
@@ -3,6 +3,174 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128
+  init: xnn_init_qs8_minmax_sse2_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128
+  init: xnn_init_qs8_minmax_sse4_params
+  k-block: 8
 - name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2
   init: xnn_init_qs8_minmax_avx2_params
   k-block: 8