Use FP32 requantization for extended-weights QS8 GEMM microkernels on x86

PiperOrigin-RevId: 389282162
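
This switches the extended-weights ("xw") QS8 GEMM microkernels for SSE2,
SSSE3, SSE4.1, AVX, and XOP from GEMMLOWP (fixed-point Q31) requantization to
FP32 requantization, renames the generated sources accordingly, and updates
BUILD.bazel, CMakeLists.txt, the benchmarks, and the generation script to
match. GEMMLOWP xw variants that already had an FP32 counterpart (the AVX2
kernels) and the SSSE3 2x/3x/4x 4c2 xw kernels are removed outright.

For reference, the FP32 path replaces the Q31 multiply/shift sequence with a
convert-scale-convert sequence, visible in the kernel diffs below as
_mm_cvtepi32_ps -> _mm_mul_ps -> _mm_cvtps_epi32, followed by the saturating
zero-point add, pack, and clamp. A minimal scalar sketch of that sequence
(hypothetical names, not part of the XNNPACK API):

  #include <math.h>
  #include <stdint.h>

  /* Illustrative scalar model of FP32 requantization. _mm_cvtps_epi32
   * rounds to nearest, ties to even, under the default MXCSR mode, which
   * nearbyintf() reproduces in the default rounding mode. */
  static inline int8_t requantize_fp32(int32_t acc, float scale,
                                       int32_t zero_point,
                                       int8_t qmin, int8_t qmax) {
    const float scaled = (float) acc * scale;    /* cvtepi32_ps + mul_ps */
    int32_t out = (int32_t) nearbyintf(scaled);  /* cvtps_epi32 */
    out += zero_point;                           /* adds_epi16 with vzp */
    if (out < qmin) out = qmin;                  /* max_epi8 */
    if (out > qmax) out = qmax;                  /* min_epi8 */
    return (int8_t) out;
  }

The removed GEMMLOWP path, visible as deleted lines in the same diffs,
computed a Q31 rounding-doubling high multiply followed by a rounding
arithmetic right shift. An equivalent scalar sketch (again with hypothetical
names; the threshold follows the mask >> 1 convention the removed gemmlowp
params were initialized with):

  /* Illustrative scalar model of the removed GEMMLOWP requantization:
   * q31 = round((acc * multiplier) / 2^31), then a rounding right shift
   * implemented with the remainder trick from the deleted
   * vremainder_mask / vremainder_threshold code. Assumes arithmetic right
   * shift of negative int32 values, as on the targeted x86 compilers. */
  static inline int32_t requantize_gemmlowp(int32_t acc, int32_t multiplier,
                                            uint32_t shift) {
    const int64_t prod = (int64_t) acc * (int64_t) multiplier;
    const int32_t q31 = (int32_t) ((prod + (INT64_C(1) << 30)) >> 31);
    const int32_t mask = (int32_t) ((UINT32_C(1) << shift) - 1);
    const int32_t remainder = (q31 & mask) - (int32_t) (q31 < 0);
    return (q31 >> shift) + (int32_t) (remainder > (mask >> 1));
  }

The FP32 sequence needs far fewer instructions than the Q31 emulation on SSE
(compare the deleted and added lines in the kernel diffs), and the benchmark
entries are renamed to track the converted kernels.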
diff --git a/BUILD.bazel b/BUILD.bazel
index adecd7e..0c9dd8f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3637,27 +3637,27 @@
     "src/qs8-gavgpool/gen/7x-minmax-sse2-c24-acc2.c",
     "src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c",
     "src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c",
-    "src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c",
-    "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c",
+    "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c",
     "src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c",
     "src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c",
-    "src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c",
+    "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c",
-    "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c",
+    "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c",
     "src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c",
     "src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c",
-    "src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c",
+    "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c",
-    "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c",
+    "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c",
     "src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c",
     "src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c",
     "src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c",
-    "src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c",
+    "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c",
     "src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c",
     "src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c",
     "src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
@@ -3783,21 +3783,18 @@
     "src/qs8-gavgpool/gen/7x-minmax-ssse3-c8-acc2.c",
     "src/qs8-gavgpool/gen/7x-minmax-ssse3-c16-acc2.c",
     "src/qs8-gavgpool/gen/7x-minmax-ssse3-c24-acc2.c",
-    "src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-ssse3.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c",
-    "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c",
-    "src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c",
+    "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c",
-    "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c",
-    "src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c",
+    "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c",
-    "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c",
+    "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c",
     "src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c",
-    "src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c",
     "src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c",
     "src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c",
     "src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c",
@@ -3975,27 +3972,27 @@
     "src/qs8-gavgpool/gen/7x-minmax-sse41-c24-acc2.c",
     "src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c",
     "src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c",
-    "src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c",
-    "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c",
+    "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c",
     "src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c",
     "src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c",
-    "src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c",
+    "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c",
-    "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c",
+    "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c",
     "src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c",
     "src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c",
-    "src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c",
+    "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c",
-    "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c",
+    "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c",
     "src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c",
     "src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c",
     "src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c",
-    "src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c",
+    "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c",
     "src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c",
     "src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c",
     "src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
@@ -4379,25 +4376,25 @@
     "src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c",
     "src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c",
     "src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c",
-    "src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c",
+    "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c",
     "src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c",
     "src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c",
-    "src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c",
+    "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c",
+    "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c",
     "src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c",
     "src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c",
-    "src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c",
+    "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c",
-    "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c",
+    "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c",
     "src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c",
     "src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c",
-    "src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c",
+    "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c",
     "src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c",
     "src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c",
     "src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c",
@@ -4564,25 +4561,25 @@
     "src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c",
     "src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c",
     "src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c",
-    "src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c",
-    "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c",
+    "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c",
     "src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c",
     "src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c",
-    "src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c",
+    "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c",
-    "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c",
+    "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c",
     "src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c",
     "src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c",
-    "src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c",
+    "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c",
-    "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c",
+    "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c",
     "src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c",
     "src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c",
-    "src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c",
+    "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c",
     "src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c",
     "src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c",
     "src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
@@ -4982,15 +4979,12 @@
     "src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
     "src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c",
     "src/qs8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c",
-    "src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c",
     "src/qs8-gemm/gen/2x8c8-minmax-fp32-avx2.c",
     "src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-avx2.c",
     "src/qs8-gemm/gen/2x8c8-xw-minmax-fp32-avx2.c",
-    "src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c",
     "src/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
     "src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-avx2.c",
     "src/qs8-gemm/gen/3x8c8-xw-minmax-fp32-avx2.c",
-    "src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c",
     "src/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
     "src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-avx2.c",
     "src/qs8-igemm/gen/2x8c8-minmax-fp32-avx2.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f11dd7..6d1ce3a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2802,27 +2802,27 @@
   src/qs8-gavgpool/gen/7x-minmax-sse2-c24-acc2.c
   src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
   src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
-  src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
+  src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c
   src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
   src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
-  src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
+  src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c
   src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
   src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
-  src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
+  src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c
   src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
   src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
   src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
-  src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
+  src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c
   src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
   src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
-  src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
+  src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c
   src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
   src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
-  src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
+  src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c
   src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
   src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
   src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
-  src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+  src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c
   src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
   src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
   src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
@@ -2946,21 +2946,18 @@
   src/qs8-gavgpool/gen/7x-minmax-ssse3-c8-acc2.c
   src/qs8-gavgpool/gen/7x-minmax-ssse3-c16-acc2.c
   src/qs8-gavgpool/gen/7x-minmax-ssse3-c24-acc2.c
-  src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
+  src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-ssse3.c
   src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
   src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
-  src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
-  src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
+  src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c
   src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
   src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
   src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
-  src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
-  src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
+  src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c
   src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
   src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
-  src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
+  src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c
   src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
-  src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
   src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
   src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
   src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
@@ -3136,27 +3133,27 @@
   src/qs8-gavgpool/gen/7x-minmax-sse41-c24-acc2.c
   src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
   src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
-  src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
+  src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c
   src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
   src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
-  src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+  src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c
   src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
   src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
-  src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+  src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c
   src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
   src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
   src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
-  src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
+  src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c
   src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
   src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
-  src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+  src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c
   src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
   src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
-  src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+  src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c
   src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
   src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
   src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
-  src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+  src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c
   src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
   src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
   src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
@@ -3536,25 +3533,25 @@
   src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c
   src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
   src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
-  src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+  src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c
   src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
   src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
-  src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
+  src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c
   src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
   src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
-  src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
+  src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c
   src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
   src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
-  src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
+  src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c
   src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
   src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
-  src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
+  src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c
   src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
   src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
-  src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
+  src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c
   src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
   src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
-  src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+  src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c
   src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
   src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
   src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
@@ -3719,25 +3716,25 @@
   src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c
   src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
   src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
-  src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
+  src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c
   src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
   src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
-  src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
+  src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c
   src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
   src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
-  src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
+  src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c
   src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
   src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
-  src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
+  src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c
   src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
   src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
-  src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
+  src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c
   src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
   src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
-  src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
+  src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c
   src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
   src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
-  src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
+  src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c
   src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
   src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
   src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
@@ -4135,15 +4132,12 @@
   src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c
   src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c
   src/qs8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c
-  src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
   src/qs8-gemm/gen/2x8c8-minmax-fp32-avx2.c
   src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-avx2.c
   src/qs8-gemm/gen/2x8c8-xw-minmax-fp32-avx2.c
-  src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
   src/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c
   src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-avx2.c
   src/qs8-gemm/gen/3x8c8-xw-minmax-fp32-avx2.c
-  src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
   src/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c
   src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-avx2.c
   src/qs8-igemm/gen/2x8c8-minmax-fp32-avx2.c
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 2f15a3b..e560d6d 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -624,13 +624,13 @@
       xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2);
   }
 
-  static void qs8_gemm_xw_2x8c8_gemmlowp__avx2(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, 2, 8, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, benchmark::utils::CheckAVX2, true);
+  static void qs8_gemm_xw_2x8c8__avx2(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, 2, 8, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
   }
-  static void qs8_gemm_xw_3x8c8_gemmlowp__avx2(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, 3, 8, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, benchmark::utils::CheckAVX2, true);
+  static void qs8_gemm_xw_3x8c8__avx2(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, 3, 8, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
   }
 
   static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
@@ -659,17 +659,17 @@
       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
   }
 
-  static void qs8_gemm_xw_2x4c2_gemmlowp__xop(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop, 2, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP, true);
+  static void qs8_gemm_xw_2x4c2__xop(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
   }
-  static void qs8_gemm_xw_3x4c2_gemmlowp__xop(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__xop, 3, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP, true);
+  static void qs8_gemm_xw_3x4c2__xop(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
   }
-  static void qs8_gemm_xw_4x4c2_gemmlowp__xop(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, 4, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP, true);
+  static void qs8_gemm_xw_4x4c2__xop(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
   }
 
   static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
@@ -690,13 +690,13 @@
       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
   }
 
-  static void qs8_gemm_xw_2x4c8_gemmlowp__xop(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, 2, 4, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP, true);
+  static void qs8_gemm_xw_2x4c8__xop(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
   }
-  static void qs8_gemm_xw_3x4c8_gemmlowp__xop(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, 3, 4, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP, true);
+  static void qs8_gemm_xw_3x4c8__xop(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
   }
 
   static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
@@ -725,17 +725,17 @@
       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
   }
 
-  static void qs8_gemm_xw_2x4c2_gemmlowp__avx(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx, 2, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX, true);
+  static void qs8_gemm_xw_2x4c2__avx(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
   }
-  static void qs8_gemm_xw_3x4c2_gemmlowp__avx(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__avx, 3, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX, true);
+  static void qs8_gemm_xw_3x4c2__avx(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
   }
-  static void qs8_gemm_xw_4x4c2_gemmlowp__avx(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, 4, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX, true);
+  static void qs8_gemm_xw_4x4c2__avx(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
   }
 
   static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
@@ -756,13 +756,13 @@
       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
   }
 
-  static void qs8_gemm_xw_2x4c8_gemmlowp__avx(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, 2, 4, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX, true);
+  static void qs8_gemm_xw_2x4c8__avx(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
   }
-  static void qs8_gemm_xw_3x4c8_gemmlowp__avx(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, 3, 4, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX, true);
+  static void qs8_gemm_xw_3x4c8__avx(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
   }
 
   static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
@@ -791,17 +791,17 @@
       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
   }
 
-  static void qs8_gemm_xw_2x4c2_gemmlowp__sse41(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41, 2, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41, true);
+  static void qs8_gemm_xw_2x4c2__sse41(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
   }
-  static void qs8_gemm_xw_3x4c2_gemmlowp__sse41(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41, 3, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41, true);
+  static void qs8_gemm_xw_3x4c2__sse41(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
   }
-  static void qs8_gemm_xw_4x4c2_gemmlowp__sse41(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, 4, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41, true);
+  static void qs8_gemm_xw_4x4c2__sse41(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
   }
 
   static void qs8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
@@ -822,26 +822,13 @@
       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
   }
 
-  static void qs8_gemm_xw_2x4c8_gemmlowp__sse41(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, 2, 4, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41, true);
+  static void qs8_gemm_xw_2x4c8__sse41(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
   }
-  static void qs8_gemm_xw_3x4c8_gemmlowp__sse41(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, 3, 4, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41, true);
-  }
-
-  static void qs8_gemm_xw_2x4c2_gemmlowp__ssse3(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__ssse3, 2, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3, true);
-  }
-  static void qs8_gemm_xw_3x4c2_gemmlowp__ssse3(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__ssse3, 3, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3, true);
-  }
-  static void qs8_gemm_xw_4x4c2_gemmlowp__ssse3(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, 4, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3, true);
+  static void qs8_gemm_xw_3x4c8__sse41(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
   }
 
   static void qs8_gemm_2x4c8__ssse3_ld64(benchmark::State& state, const char* net) {
@@ -862,13 +849,13 @@
       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
   }
 
-  static void qs8_gemm_xw_2x4c8_gemmlowp__ssse3(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, 2, 4, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3, true);
+  static void qs8_gemm_xw_2x4c8__ssse3(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3, true);
   }
-  static void qs8_gemm_xw_3x4c8_gemmlowp__ssse3(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, 3, 4, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3, true);
+  static void qs8_gemm_xw_3x4c8__ssse3(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3, true);
   }
 
   static void qs8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
@@ -897,17 +884,17 @@
       xnn_init_qs8_conv_minmax_fp32_sse2_params);
   }
 
-  static void qs8_gemm_xw_2x4c2_gemmlowp__sse2(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2, 2, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, nullptr, true);
+  static void qs8_gemm_xw_2x4c2__sse2(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, 2, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
   }
-  static void qs8_gemm_xw_3x4c2_gemmlowp__sse2(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse2, 3, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, nullptr, true);
+  static void qs8_gemm_xw_3x4c2__sse2(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, 3, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
   }
-  static void qs8_gemm_xw_4x4c2_gemmlowp__sse2(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, 4, 4, 2, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, nullptr, true);
+  static void qs8_gemm_xw_4x4c2__sse2(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, 4, 4, 2, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
   }
 
   static void qs8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
@@ -928,13 +915,13 @@
       xnn_init_qs8_conv_minmax_fp32_sse2_params);
   }
 
-  static void qs8_gemm_xw_2x4c8_gemmlowp__sse2(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, 2, 4, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, nullptr, true);
+  static void qs8_gemm_xw_2x4c8__sse2(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, 2, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
   }
-  static void qs8_gemm_xw_3x4c8_gemmlowp__sse2(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, 3, 4, 8, 1,
-      xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, nullptr, true);
+  static void qs8_gemm_xw_3x4c8__sse2(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, 3, 4, 8, 1,
+      xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
   }
 
   BENCHMARK_GEMM(qs8_gemm_2x16c8__avx512skx)
@@ -943,8 +930,8 @@
 
   BENCHMARK_GEMM(qs8_gemm_2x8c8__avx2)
   BENCHMARK_GEMM(qs8_gemm_3x8c8__avx2)
-  BENCHMARK_GEMM(qs8_gemm_xw_2x8c8_gemmlowp__avx2)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x8c8_gemmlowp__avx2)
+  BENCHMARK_GEMM(qs8_gemm_xw_2x8c8__avx2)
+  BENCHMARK_GEMM(qs8_gemm_xw_3x8c8__avx2)
 
   BENCHMARK_GEMM(qs8_gemm_2x4c2__xop_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c2__xop_ld64)
@@ -952,15 +939,15 @@
   BENCHMARK_GEMM(qs8_gemm_2x4c2__xop_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c2__xop_ld128)
   BENCHMARK_GEMM(qs8_gemm_4x4c2__xop_ld128)
-  BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__xop)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__xop)
-  BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__xop)
+  BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__xop)
+  BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__xop)
+  BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__xop)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__xop_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c8__xop_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__xop_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c8__xop_ld128)
-  BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__xop)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__xop)
+  BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__xop)
+  BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__xop)
 
   BENCHMARK_GEMM(qs8_gemm_2x4c2__avx_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c2__avx_ld64)
@@ -968,15 +955,15 @@
   BENCHMARK_GEMM(qs8_gemm_2x4c2__avx_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c2__avx_ld128)
   BENCHMARK_GEMM(qs8_gemm_4x4c2__avx_ld128)
-  BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__avx)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__avx)
-  BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__avx)
+  BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__avx)
+  BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__avx)
+  BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__avx)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__avx_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c8__avx_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__avx_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c8__avx_ld128)
-  BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__avx)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__avx)
+  BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__avx)
+  BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__avx)
 
   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse41_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse41_ld64)
@@ -984,25 +971,22 @@
   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse41_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse41_ld128)
   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse41_ld128)
-  BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__sse41)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__sse41)
-  BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__sse41)
+  BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__sse41)
+  BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__sse41)
+  BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__sse41)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse41_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse41_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse41_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse41_ld128)
-  BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__sse41)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__sse41)
+  BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__sse41)
+  BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__sse41)
 
-  BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__ssse3)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__ssse3)
-  BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__ssse3)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__ssse3_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c8__ssse3_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__ssse3_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c8__ssse3_ld128)
-  BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__ssse3)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__ssse3)
+  BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__ssse3)
+  BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__ssse3)
 
   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse2_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse2_ld64)
@@ -1010,15 +994,15 @@
   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse2_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse2_ld128)
   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse2_ld128)
-  BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__sse2)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__sse2)
-  BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__sse2)
+  BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__sse2)
+  BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__sse2)
+  BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__sse2)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse2_ld64)
   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse2_ld64)
   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse2_ld128)
   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse2_ld128)
-  BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__sse2)
-  BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__sse2)
+  BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__sse2)
+  BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__sse2)
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index fef2379..a847184 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -512,30 +512,25 @@
 tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qu8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
 tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qu8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c
 
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
-
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c
 
 ### C8 micro-kernels
 tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -651,32 +646,28 @@
 tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qu8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
 tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -D VARIANT=LD128    -o src/qu8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c
 
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32     -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c
 
 ################################### x86 AVX2 ##################################
 ### C8 micro-kernels
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D VARIANT=LD128    -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D VARIANT=LD128    -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-avx2.c
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D VARIANT=LD128    -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-avx2.c
-
 tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D VARIANT=LD128    -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-gemm/gen/1x8c8-minmax-fp32-avx2.c
 tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D VARIANT=LD128    -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-gemm/gen/2x8c8-minmax-fp32-avx2.c
 tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D VARIANT=LD128    -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-gemm/gen/3x8c8-minmax-fp32-avx2.c
@@ -689,10 +680,6 @@
 tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D VARIANT=LD128    -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-gemm/gen/2x8c8-minmax-fp32-avx2.c
 tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D VARIANT=LD128    -D DATATYPE=QU8 -D REQUANTIZATION=FP32     -o src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c
 
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D VARIANT=EXTENDED -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D VARIANT=EXTENDED -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D VARIANT=EXTENDED -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
-
 tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D VARIANT=EXTENDED -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c
 tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D VARIANT=EXTENDED -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-gemm/gen/2x8c8-xw-minmax-fp32-avx2.c
 tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D VARIANT=EXTENDED -D DATATYPE=QC8 -D REQUANTIZATION=FP32     -o src/qc8-gemm/gen/3x8c8-xw-minmax-fp32-avx2.c
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c
similarity index 73%
copy from src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
copy to src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c
index b649211..108c9b3 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c
@@ -16,7 +16,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -99,37 +99,21 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
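
The hunk above is essentially the whole change for the SSE4-class kernels: about a dozen integer multiplies, shifts, and compares collapse into three instructions. Condensed from the + lines, with vacc standing in for vacc0x0123:

    __m128 vscaled = _mm_cvtepi32_ps(vacc);         // int32 accumulators -> float
    vscaled = _mm_mul_ps(vscaled,
        _mm_load_ps(params->fp32_sse4.scale));      // apply the requantization scale
    vacc = _mm_cvtps_epi32(vscaled);                // round-to-nearest-even -> int32

The zero-point add, packing, and clamping below this point are unchanged except for reading fp32_sse4 parameters.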
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c
similarity index 67%
copy from src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
copy to src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c
index b649211..0d3fe35 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c
@@ -9,14 +9,14 @@
 
 #include <assert.h>
 
-#include <smmintrin.h>
+#include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/math.h>
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -48,7 +48,7 @@
     size_t k = kc;
     while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
 
       const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -73,7 +73,7 @@
     }
     if (k != 0) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
 
       const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -99,37 +99,22 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
@@ -146,7 +131,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
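
Beyond the requantization swap, the SSE2 copy makes three SSE2-safe substitutions visible above: sign extension without pmovsxbw, clamping in the int16 domain before the byte pack (SSE2 has no signed-byte min/max), and reading the final byte through the low 32 bits instead of the SSE4.1 _mm_extract_epi8. The sign-extension idiom, annotated:

    // SSE2 sign extension of eight int8 values to int16 without pmovsxbw:
    // duplicate each byte into both halves of a 16-bit lane, then shift the
    // high copy down arithmetically so its sign bit fills the lane.
    const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
    const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);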
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c
similarity index 73%
rename from src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
rename to src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c
index b649211..04b457d 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c
@@ -16,7 +16,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -99,37 +99,21 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-ssse3.c
similarity index 100%
rename from src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
rename to src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-ssse3.c
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c
similarity index 73%
rename from src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c
index 534ee1b..fcde93f 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c
@@ -21,7 +21,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -104,37 +104,21 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index 0f03fdf..0000000
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,172 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c2-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 2);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-
-  do {
-    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = kc;
-    while (k >= 8 * sizeof(int8_t)) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k -= 8 * sizeof(int8_t);
-    }
-    if (k != 0) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 = (const int8_t*) ((uintptr_t) a0 + k);
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-      w = (const void*) ((const int16_t*) w + 8);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
-      if (k > 2 * sizeof(int8_t)) {
-        const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
-        w = (const void*) ((const int16_t*) w + 8);
-
-        vacc0x0123 = _mm_add_epi32(vacc0x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
-        if (k > 4 * sizeof(int8_t)) {
-          const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
-          w = (const void*) ((const int16_t*) w + 8);
-
-          vacc0x0123 = _mm_add_epi32(vacc0x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-        }
-      }
-    }
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
-    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-
-    const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
-
-    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-
-    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
-    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-
-    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-
-    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-
-    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-
-    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
-    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
-    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
-
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
-
-    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
-
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
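
For contrast with the three-instruction FP32 path, the file deleted above implements GEMMLOWP requantization: a Q31 rounding-doubling high multiply followed by a rounding arithmetic shift. A scalar model of what each lane computed; the mask/threshold derivation is the conventional gemmlowp setup, not lifted from this diff, and an arithmetic >> on negative int32 is assumed, as on the targeted compilers:

    #include <stdint.h>

    // Conventional setup: mask = (1u << shift) - 1, threshold = mask >> 1.
    static inline int32_t requantize_gemmlowp(int32_t acc, int32_t multiplier,
                                              uint32_t shift, int32_t mask,
                                              int32_t threshold) {
      // Q31 rounding-doubling high multiply: bits 31..62 of the 64-bit
      // product, after adding the 2^30 rounding term.
      const int64_t product = (int64_t) acc * (int64_t) multiplier;
      const int32_t q31 = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
      // Rounding shift: bump the truncated quotient when the shifted-out
      // remainder (bias-corrected for negative q31) exceeds the threshold.
      const int32_t rem = (q31 & mask) - (int32_t) (q31 < 0);
      return (q31 >> shift) + (int32_t) (rem > threshold);
    }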
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
deleted file mode 100644
index ed5fd68..0000000
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
+++ /dev/null
@@ -1,155 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c2-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 2);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-
-  do {
-    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = kc;
-    while (k >= 8 * sizeof(int8_t)) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k -= 8 * sizeof(int8_t);
-    }
-    if (k != 0) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 = (const int8_t*) ((uintptr_t) a0 + k);
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-      w = (const void*) ((const int16_t*) w + 8);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
-      if (k > 2 * sizeof(int8_t)) {
-        const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
-        w = (const void*) ((const int16_t*) w + 8);
-
-        vacc0x0123 = _mm_add_epi32(vacc0x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
-        if (k > 4 * sizeof(int8_t)) {
-          const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
-          w = (const void*) ((const int16_t*) w + 8);
-
-          vacc0x0123 = _mm_add_epi32(vacc0x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-        }
-      }
-    }
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
-    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
-
-
-    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
-
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c
similarity index 68%
copy from src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c
index 8f5cae1..7b9e7f3 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -75,37 +75,21 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
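
In the c8 kernels each output column accumulates into its own vector, and the FP32 change only touches what happens after the horizontal reduction seen in the context line above. The reduction shape on SSSE3+ targets, annotated from the surrounding kernels:

    // Each vacc0xN holds four int32 partial sums for output column N.
    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);  // fold columns 0,1
    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);  // fold columns 2,3
    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);    // lane n = column n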
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c
similarity index 60%
rename from src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
rename to src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c
index a32f77c..593d736 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -75,50 +75,18 @@
 
     __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
 
-    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
+    const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
 
-    const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
 
-    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-
-    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
-    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-
-    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-
-    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-
-    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-
-    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
-    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
     vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
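
Note that _mm_hadd_epi32 is an SSSE3 instruction, so the SSE2 c8 kernel reduces with interleave-and-add instead, as in the context line above. A sketch, assuming vacc0x02 and vacc0x13 hold the column-0/2 and column-1/3 partial sums produced by the unchanged unpack steps earlier in the kernel:

    // Interleaving lines up matching columns across the two vectors, so a
    // single add completes the reduction: lane n of the result = column n.
    __m128i vacc0x0123 = _mm_add_epi32(
        _mm_unpacklo_epi32(vacc0x02, vacc0x13),
        _mm_unpackhi_epi32(vacc0x02, vacc0x13));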
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c
similarity index 68%
rename from src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
rename to src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c
index 8f5cae1..4abd723 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -75,37 +75,21 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c
similarity index 62%
copy from src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c
index 8f5cae1..e2d6c95 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c
@@ -9,13 +9,13 @@
 
 #include <assert.h>
 
-#include <smmintrin.h>
+#include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -50,7 +50,7 @@
     size_t k = 0;
     while (k < kc) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
 
       const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -75,37 +75,22 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
@@ -122,7 +107,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-wasmsimd.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-wasmsimd.c
new file mode 100644
index 0000000..d1bda6a
--- /dev/null
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-wasmsimd.c
@@ -0,0 +1,138 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__wasmsimd(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  const v128_t vzero = wasm_f64x2_splat(0.0);
+  do {
+    v128_t vacc0x0 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[0]);
+    v128_t vacc0x1 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[1]);
+    v128_t vacc0x2 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[2]);
+    v128_t vacc0x3 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[3]);
+    w = (const void*) ((const int32_t*) w + 4);
+
+    size_t k = 0;
+    while (k < kc) {
+      const v128_t vxa0 = wasm_i16x8_load8x8(a0);
+      a0 += 8;
+
+      const v128_t vxb0 = wasm_v128_load(w);
+
+      const v128_t vprod0x0 = wasm_i16x8_mul(vxa0, vxb0);
+      vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_low_i16x8(vprod0x0));
+      vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_high_i16x8(vprod0x0));
+      const v128_t vxb1 = wasm_v128_load((const int16_t*) w + 8);
+
+      const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);
+      vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_low_i16x8(vprod0x1));
+      vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_high_i16x8(vprod0x1));
+      const v128_t vxb2 = wasm_v128_load((const int16_t*) w + 16);
+
+      const v128_t vprod0x2 = wasm_i16x8_mul(vxa0, vxb2);
+      vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_low_i16x8(vprod0x2));
+      vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_high_i16x8(vprod0x2));
+      const v128_t vxb3 = wasm_v128_load((const int16_t*) w + 24);
+
+      const v128_t vprod0x3 = wasm_i16x8_mul(vxa0, vxb3);
+      vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_low_i16x8(vprod0x3));
+      vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_high_i16x8(vprod0x3));
+
+      w = (const void*) ((const int16_t*) w + 32);
+      k += 8 * sizeof(int8_t);
+    }
+
+    const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7));
+    const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7));
+
+    v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7));
+
+    const v128_t vsign0x0123 = wasm_i32x4_shr(vacc0x0123, 31);
+
+    const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5);
+
+    const v128_t vmultiplier = wasm_v128_load(params->gemmlowp_wasmsimd.multiplier);
+    const v128_t vrounding = wasm_v128_load(params->gemmlowp_wasmsimd.rounding);
+    const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding);
+    const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7);
+
+    const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding);
+
+    const v128_t vq31prod0x0123 = wasm_v32x4_shuffle(vprod0x01, vprod0x23, 1, 3, 5, 7);
+
+    const v128_t vremainder_mask = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_mask);
+    const v128_t vrem0x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod0x0123, vremainder_mask), wasm_i32x4_shr(vq31prod0x0123, 31));
+
+    const v128_t vthreshold = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_threshold);
+    const int32_t vshift = params->gemmlowp_wasmsimd.shift;
+    vacc0x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod0x0123, vshift), wasm_i32x4_gt(vrem0x0123, vthreshold));
+
+    const v128_t voutput_zero_point = wasm_v128_load(params->gemmlowp_wasmsimd.output_zero_point);
+    v128_t vacc00x0123 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+    v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123);
+
+    const v128_t voutput_min = wasm_v128_load(params->gemmlowp_wasmsimd.output_min);
+    vout = wasm_i8x16_max(vout, voutput_min);
+
+    const v128_t voutput_max = wasm_v128_load(params->gemmlowp_wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
+    if (nc >= 4) {
+      *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) wasm_i16x8_extract_lane(vout, 0);
+        c0 += 2;
+        vout = wasm_u32x4_shr(vout, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) wasm_i8x16_extract_lane(vout, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
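
Two WAsm SIMD idioms in the file above are worth flagging, since the SIMD proposal of the time had no direct 32-bit scalar lane load or store: the bias words enter by replacing lane 0 of a zeroed vector through a float pointer, and the four packed output bytes leave through an f32 lane extract. Both are bit-pattern moves rather than numeric float conversions, and they rely on the engine preserving the raw bits:

    // 32-bit load into lane 0 of a zeroed vector, via the f32 lane API.
    v128_t vacc = wasm_f32x4_replace_lane(wasm_f64x2_splat(0.0), 0,
                                          ((const float*) w)[0]);
    // 32-bit store of lane 0: the extracted f32 carries the packed int8 bytes.
    *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);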
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c
similarity index 68%
rename from src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c
index c85e7af..5bce735 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -80,37 +80,21 @@
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
deleted file mode 100644
index 27ac294..0000000
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
+++ /dev/null
@@ -1,131 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c8-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-
-  do {
-    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
-    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
-    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
-    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
-    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
-
-    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
-    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
-
-
-    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
-
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
deleted file mode 100644
index 0be1783..0000000
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
+++ /dev/null
@@ -1,148 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c8-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <tmmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-
-  do {
-    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
-    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
-    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
-    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
-    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
-
-    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
-    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-
-    const __m128i vabsacc0x0123 = _mm_abs_epi32(vacc0x0123);
-
-    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-
-    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
-    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-
-    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-
-    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-
-    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-
-    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
-    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
-    __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
-
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
-    vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
-
-    __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
-
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
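
The kernels deleted in this change all implement GEMMLOWP requantization: a Q31 rounding-doubling high multiply followed by a rounding arithmetic right shift, where the remainder-mask/remainder-threshold pair implements round-to-nearest with ties away from zero. A minimal scalar model of that pipeline (not part of this diff; `multiplier`, `shift`, and the 2^30 rounding constant correspond to the gemmlowp_* params loaded above):

    #include <stdint.h>

    // Scalar sketch of the GEMMLOWP requantization the deleted SIMD code
    // performs per 32-bit accumulator. Not part of this change. Assumes
    // arithmetic >> on negative values, as the SIMD sequence provides.
    static inline int32_t gemmlowp_requantize(int32_t acc, int32_t multiplier, uint32_t shift) {
      // Q31 rounding-doubling high multiply: (acc * multiplier + 2^30) >> 31,
      // matching the multiply / add-rounding / shift-by-31 sequence above.
      const int64_t product = (int64_t) acc * (int64_t) multiplier;
      const int32_t q31prod = (int32_t) ((product + INT64_C(0x40000000)) >> 31);
      // Rounding right shift: remainder vs. threshold = mask >> 1 rounds to
      // nearest; the -1 correction for negative inputs makes ties round
      // away from zero in both directions.
      const int32_t rem_mask = (int32_t) ((UINT32_C(1) << shift) - 1);
      const int32_t rem = (q31prod & rem_mask) - (int32_t) (q31prod < 0);
      return (q31prod >> shift) + (int32_t) (rem > (rem_mask >> 1));
    }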
diff --git a/src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
deleted file mode 100644
index 8b989f5..0000000
--- a/src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
+++ /dev/null
@@ -1,158 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx8c8-avx2.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <immintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/intrinsics-polyfill.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-
-  do {
-    const __m128i vbias0x0 = _mm_loadu_si32(w);
-    const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
-    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
-    const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
-    const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
-    __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
-    const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
-    const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
-    __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
-    const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
-    const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
-    __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
-    w = (const void*) ((const int32_t*) w + 8);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
-      const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
-      a0 += 8;
-
-      const __m256i vxb01 = _mm256_load_si256((const __m256i*) w);
-
-      vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
-      const __m256i vxb23 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 16));
-
-      vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
-      const __m256i vxb45 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 32));
-
-      vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
-      const __m256i vxb67 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 48));
-
-      vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
-
-      w = (const void*) ((const int16_t*) w + 64);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
-    const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
-
-    const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
-
-    const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-    __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
-
-    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.multiplier);
-    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.rounding);
-
-    const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
-
-    const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
-
-    const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x11335577, vmultiplier), vrounding);
-
-    const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
-    const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
-
-    const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
-
-    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_mask);
-    const __m256i vrem0x01234567 =
-      _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
-
-    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_avx2.shift);
-    vacc0x01234567 =
-      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
-
-    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_zero_point);
-    __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
-
-    vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
-
-    __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
-
-    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_min));
-    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_max));
-
-    __m128i vout_lo = _mm256_castsi256_si128(vout);
-    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
-
-    if (nc >= 8) {
-      _mm_storel_epi64((__m128i*) c0, vout_lo);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        _mm_storeu_si32(c0, vout_lo);
-
-        c0 += 4;
-
-        vout_lo = _mm_srli_epi64(vout_lo, 32);
-        vout_hi = _mm_srli_epi64(vout_hi, 32);
-      }
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
-
-        c0 += 2;
-
-        vout_lo = _mm_srli_epi32(vout_lo, 16);
-        vout_hi = _mm_srli_epi32(vout_hi, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
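
In the AVX2 kernel removed above, _mm256_hadd_epi32 adds within each 128-bit lane, so two rounds of horizontal adds over the accumulator pairs leave the eight output channels in the order 0,2,4,6,1,3,5,7 (hence the name vacc0x02461357); _mm256_permutevar8x32_epi32 with the index vector {0,4,1,5,2,6,3,7} (written high-to-low as _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)) restores channel order. A small standalone check of that index arithmetic (not part of this diff):

    #include <stdio.h>

    // Verifies the permute indices used by the deleted AVX2 kernel:
    // gathering the post-hadd channel order with {0,4,1,5,2,6,3,7}
    // yields channels 0..7 in order.
    int main(void) {
      const int after_hadd[8] = {0, 2, 4, 6, 1, 3, 5, 7};
      const int idx[8] = {0, 4, 1, 5, 2, 6, 3, 7};
      for (int i = 0; i < 8; i++) {
        printf("%d ", after_hadd[idx[i]]);  // prints 0 1 2 3 4 5 6 7
      }
      printf("\n");
      return 0;
    }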
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c
similarity index 71%
copy from src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c
index 3520996..989b12e 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c
@@ -16,7 +16,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -126,47 +126,24 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
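
This hunk is the heart of the change: the fixed-point Q31 pipeline is replaced by three steps: convert the int32 accumulators to float, multiply by the precomputed scale, and convert back with _mm_cvtps_epi32, which rounds to nearest-even under the default MXCSR mode. A scalar equivalent for one accumulator (not part of this diff):

    #include <math.h>
    #include <stdint.h>

    // Scalar sketch of the FP32 requantization the new kernels perform;
    // 'scale' corresponds to params->fp32_sse4.scale. Not part of this change.
    static inline int32_t fp32_requantize(int32_t acc, float scale) {
      // lrintf rounds per the current rounding mode, round-to-nearest-even
      // by default, matching _mm_cvtps_epi32.
      return (int32_t) lrintf((float) acc * scale);
    }

The saturating int16 pack that follows bounds the converted value, so the FP32 path needs no separate int32 clamp before packing.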
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c
similarity index 65%
copy from src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c
index 3520996..bcc6878 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c
@@ -9,14 +9,14 @@
 
 #include <assert.h>
 
-#include <smmintrin.h>
+#include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/math.h>
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -55,10 +55,10 @@
     size_t k = kc;
     while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
       a1 += 8;
 
       const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -91,10 +91,10 @@
     }
     if (k != 0) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
       a1 = (const int8_t*) ((uintptr_t) a1 + k);
 
       const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -126,51 +126,30 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
@@ -188,8 +167,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
       }
 
       nc = 0;
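
Beyond the requantization swap, the SSE2 copy differs from its SSE4.1 source in two SSE2-portable idioms visible above: _mm_cvtepi8_epi16 becomes unpack-then-arithmetic-shift, and the _mm_extract_epi32/_mm_extract_epi8 stores become shift-and-_mm_cvtsi128_si32 sequences. The sign-extension idiom as a standalone check (not part of this diff): duplicating each byte with _mm_unpacklo_epi8(v, v) places a copy in the high half of every 16-bit lane, so an arithmetic shift right by 8 yields the sign-extended value.

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    // Demonstrates the SSE2 int8 -> int16 sign-extension idiom used above.
    int main(void) {
      const int8_t input[8] = {-128, -1, 0, 1, 127, -5, 42, -100};
      const __m128i va = _mm_loadl_epi64((const __m128i*) input);
      const __m128i vxa = _mm_srai_epi16(_mm_unpacklo_epi8(va, va), 8);
      int16_t output[8];
      _mm_storeu_si128((__m128i*) output, vxa);
      for (int i = 0; i < 8; i++) {
        printf("%d ", output[i]);  // prints -128 -1 0 1 127 -5 42 -100
      }
      printf("\n");
      return 0;
    }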
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c
similarity index 71%
rename from src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
rename to src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c
index 3520996..41ea918 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c
@@ -16,7 +16,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -126,47 +126,24 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
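
All variants share the same output stage: saturating pack to int16, saturating add of the output zero point, saturating pack to int8, then a clamp to [output_min, output_max]. SSE4.1/AVX/XOP clamp after the int8 pack with _mm_max_epi8/_mm_min_epi8; SSE2 lacks packed int8 min/max, so it clamps in int16 before the final pack, with the same result since the bounds are themselves int8 values. A scalar model of the stage (not part of this diff):

    #include <stdint.h>

    // Scalar sketch of the shared QS8 output stage. Not part of this change.
    static inline int8_t qs8_output(int32_t acc, int16_t zero_point,
                                    int8_t out_min, int8_t out_max) {
      // _mm_packs_epi32: saturate to int16.
      int32_t v = acc;
      if (v > INT16_MAX) v = INT16_MAX;
      if (v < INT16_MIN) v = INT16_MIN;
      // _mm_adds_epi16: saturating add of the output zero point.
      int32_t sum = v + zero_point;
      if (sum > INT16_MAX) sum = INT16_MAX;
      if (sum < INT16_MIN) sum = INT16_MIN;
      // _mm_packs_epi16: saturate to int8, then clamp to [out_min, out_max].
      int32_t out = sum;
      if (out > INT8_MAX) out = INT8_MAX;
      if (out < INT8_MIN) out = INT8_MIN;
      if (out < out_min) out = out_min;
      if (out > out_max) out = out_max;
      return (int8_t) out;
    }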
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-ssse3.c
similarity index 100%
rename from src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
rename to src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-ssse3.c
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c
similarity index 71%
rename from src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c
index 6359db1..338a372 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c
@@ -21,7 +21,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -131,47 +131,24 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
deleted file mode 100644
index 80f4f6e..0000000
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
+++ /dev/null
@@ -1,198 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c2-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 2);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 2);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-
-  do {
-    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
-    __m128i vacc1x0123 = vacc0x0123;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = kc;
-    while (k >= 8 * sizeof(int8_t)) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
-      a1 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k -= 8 * sizeof(int8_t);
-    }
-    if (k != 0) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 = (const int8_t*) ((uintptr_t) a0 + k);
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
-      a1 = (const int8_t*) ((uintptr_t) a1 + k);
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-      w = (const void*) ((const int16_t*) w + 8);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
-      if (k > 2 * sizeof(int8_t)) {
-        const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
-        w = (const void*) ((const int16_t*) w + 8);
-
-        vacc0x0123 = _mm_add_epi32(vacc0x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc1x0123 = _mm_add_epi32(vacc1x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
-        if (k > 4 * sizeof(int8_t)) {
-          const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
-          w = (const void*) ((const int16_t*) w + 8);
-
-          vacc0x0123 = _mm_add_epi32(vacc0x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc1x0123 = _mm_add_epi32(vacc1x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-        }
-      }
-    }
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
-
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
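
The 2x4c2 kernels deleted and recreated in this diff share one inner-loop scheme: activations are sign-extended to int16, a 32-bit pair (two consecutive k values) is broadcast to all lanes with _mm_shuffle_epi32, and _mm_madd_epi16 against one row of extended weights accumulates those two k values for four output channels at once. One full 8-k step, in scalar form (not part of this diff):

    #include <stdint.h>

    // Scalar sketch of one k-step of the 4c2 layout: the four weight rows
    // correspond to vxb0..vxb3 above (k offsets 0, 2, 4, 6), each holding
    // two int16 weights per output channel. Not part of this change.
    static void gemm_4c2_step(int32_t acc[4], const int16_t xa[8], const int16_t xb[4][8]) {
      for (int r = 0; r < 4; r++) {    // weight row = broadcast pair index
        for (int n = 0; n < 4; n++) {  // output channel
          // _mm_madd_epi16: multiply adjacent int16 pairs, add horizontally.
          acc[n] += (int32_t) xa[2 * r + 0] * xb[r][2 * n + 0]
                  + (int32_t) xa[2 * r + 1] * xb[r][2 * n + 1];
        }
      }
    }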
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index 3ca4e3f..0000000
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,224 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c2-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 2);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 2);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-
-  do {
-    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
-    __m128i vacc1x0123 = vacc0x0123;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = kc;
-    while (k >= 8 * sizeof(int8_t)) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
-      a1 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k -= 8 * sizeof(int8_t);
-    }
-    if (k != 0) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 = (const int8_t*) ((uintptr_t) a0 + k);
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
-      a1 = (const int8_t*) ((uintptr_t) a1 + k);
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-      w = (const void*) ((const int16_t*) w + 8);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
-      if (k > 2 * sizeof(int8_t)) {
-        const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
-        w = (const void*) ((const int16_t*) w + 8);
-
-        vacc0x0123 = _mm_add_epi32(vacc0x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc1x0123 = _mm_add_epi32(vacc1x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
-        if (k > 4 * sizeof(int8_t)) {
-          const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
-          w = (const void*) ((const int16_t*) w + 8);
-
-          vacc0x0123 = _mm_add_epi32(vacc0x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc1x0123 = _mm_add_epi32(vacc1x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-        }
-      }
-    }
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
-    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-    const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
-
-    const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
-    const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
-
-    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-    const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
-
-    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
-    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-    const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
-
-    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-    const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
-
-    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-    const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
-
-    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-    const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
-
-    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
-    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
-
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
-        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
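
The deleted SSE2 gemmlowp kernel above also shows why the Q31 path was so costly below SSE4.1: _mm_mul_epi32 (signed 32x32 -> 64) does not exist in SSE2, so the kernel takes absolute values, multiplies with the unsigned _mm_mul_epu32, and restores the sign with an xor/subtract pair, since (x ^ m) - m negates x when the mask m is all-ones and is the identity when m is zero. The scalar equivalent (not part of this diff):

    #include <stdint.h>

    // Scalar sketch of the SSE2 sign-handling idiom in the deleted kernel.
    // Not part of this change.
    static inline int64_t signed_mul_via_unsigned(int32_t acc, uint32_t multiplier) {
      const int64_t nmask = -(int64_t) (acc < 0);               // 0 or -1, like vnmask
      const uint32_t absacc = acc < 0 ? (uint32_t) (-(int64_t) acc) : (uint32_t) acc;
      const uint64_t absprod = (uint64_t) absacc * multiplier;  // _mm_mul_epu32
      return ((int64_t) absprod ^ nmask) - nmask;               // restore the sign
    }

The FP32 replacement removes this entire abs/multiply/re-sign dance along with the remainder correction, which is where most of the per-row instruction savings in the new -sse2 kernels come from.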
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c
similarity index 65%
copy from src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
copy to src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c
index 573ab1c..6c6da41 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -95,47 +95,24 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c
new file mode 100644
index 0000000..97b1c93
--- /dev/null
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c
@@ -0,0 +1,146 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((const int32_t*) w + 4);
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
+      a1 += 8;
+
+      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+      w = (const void*) ((const int16_t*) w + 32);
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
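
Because this new SSE2 4c8 kernel cannot use _mm_hadd_epi32 (an SSSE3 instruction), it reduces the four per-channel accumulator registers with the unpacklo/unpackhi/add cascade above: a 4x4 transpose-and-add that leaves lane n of vacc0x0123 holding the full four-lane sum of vacc0xn. The net effect, in scalar form (not part of this diff):

    #include <stdint.h>

    // Scalar sketch of the SSE2 reduction at the end of the k loop:
    // partial[n] models the four lanes of vaccMxn. Not part of this change.
    static void reduce_4x4(int32_t totals[4], const int32_t partial[4][4]) {
      for (int n = 0; n < 4; n++) {
        totals[n] = partial[n][0] + partial[n][1] + partial[n][2] + partial[n][3];
      }
    }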
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c
similarity index 65%
rename from src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
rename to src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c
index 573ab1c..78ca719 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -95,47 +95,24 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
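
The fp32 requantization that replaces the Q31 arithmetic in these hunks is simple in scalar form. A sketch, assuming the default round-to-nearest-even MXCSR mode (which is what _mm_cvtps_epi32 uses); names are illustrative:

#include <math.h>
#include <stdint.h>

// Scale the int32 accumulator in float, round to nearest even, add the
// output zero point, then clamp to the quantized output range.
static inline int8_t requantize_fp32(int32_t acc, float scale,
                                     int16_t zero_point,
                                     int8_t out_min, int8_t out_max) {
  const float scaled = (float) acc * scale;
  int32_t out = (int32_t) lrintf(scaled);  // nearest-even by default
  out += zero_point;
  if (out < (int32_t) out_min) out = out_min;
  if (out > (int32_t) out_max) out = out_max;
  return (int8_t) out;
}

In the kernels the zero-point add and the clamp ride on the saturating int16/int8 packs, which is equivalent because output_min and output_max always lie within int8 range.
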
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c
new file mode 100644
index 0000000..1287f3d
--- /dev/null
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c
@@ -0,0 +1,146 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <tmmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    w = (const void*) ((const int32_t*) w + 4);
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
+      a1 += 8;
+
+      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+      w = (const void*) ((const int16_t*) w + 32);
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+    const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
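
This SSE2/SSSE3 variant cannot use _mm_cvtepi8_epi16 (an SSE4.1 instruction), so it sign-extends activation bytes by interleaving each byte with itself and arithmetic-shifting the 16-bit lanes right by 8; likewise it clamps in int16 with _mm_max_epi16/_mm_min_epi16 before the int8 pack, since _mm_max_epi8/_mm_min_epi8 are also SSE4.1. A self-contained check of the sign-extension idiom:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int8_t a[8] = {-128, -1, 0, 1, 2, 127, -42, 42};
  const __m128i va = _mm_loadl_epi64((const __m128i*) a);
  // unpacklo(va, va) makes 16-bit lane i equal to (a[i] << 8) | (uint8_t) a[i];
  // the arithmetic shift then fills the high byte with the sign bit.
  const __m128i vxa = _mm_srai_epi16(_mm_unpacklo_epi8(va, va), 8);
  int16_t out[8];
  _mm_storeu_si128((__m128i*) out, vxa);
  for (int i = 0; i < 8; i++) {
    printf("%d -> %d\n", a[i], out[i]);  // each byte sign-extended to int16
  }
  return 0;
}
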
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c
new file mode 100644
index 0000000..cada9f0
--- /dev/null
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c
@@ -0,0 +1,179 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__wasmsimd(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  const v128_t vzero = wasm_f64x2_splat(0.0);
+  do {
+    v128_t vacc0x0 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[0]);
+    v128_t vacc0x1 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[1]);
+    v128_t vacc0x2 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[2]);
+    v128_t vacc0x3 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[3]);
+    v128_t vacc1x0 = vacc0x0;
+    v128_t vacc1x1 = vacc0x1;
+    v128_t vacc1x2 = vacc0x2;
+    v128_t vacc1x3 = vacc0x3;
+    w = (const void*) ((const int32_t*) w + 4);
+
+    size_t k = 0;
+    while (k < kc) {
+      const v128_t vxa0 = wasm_i16x8_load8x8(a0);
+      a0 += 8;
+      const v128_t vxa1 = wasm_i16x8_load8x8(a1);
+      a1 += 8;
+
+      const v128_t vxb0 = wasm_v128_load(w);
+
+      const v128_t vprod0x0 = wasm_i16x8_mul(vxa0, vxb0);
+      vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_low_i16x8(vprod0x0));
+      vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_high_i16x8(vprod0x0));
+      const v128_t vprod1x0 = wasm_i16x8_mul(vxa1, vxb0);
+      vacc1x0 = wasm_i32x4_add(vacc1x0, wasm_i32x4_extend_low_i16x8(vprod1x0));
+      vacc1x0 = wasm_i32x4_add(vacc1x0, wasm_i32x4_extend_high_i16x8(vprod1x0));
+      const v128_t vxb1 = wasm_v128_load((const int16_t*) w + 8);
+
+      const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);
+      vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_low_i16x8(vprod0x1));
+      vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_high_i16x8(vprod0x1));
+      const v128_t vprod1x1 = wasm_i16x8_mul(vxa1, vxb1);
+      vacc1x1 = wasm_i32x4_add(vacc1x1, wasm_i32x4_extend_low_i16x8(vprod1x1));
+      vacc1x1 = wasm_i32x4_add(vacc1x1, wasm_i32x4_extend_high_i16x8(vprod1x1));
+      const v128_t vxb2 = wasm_v128_load((const int16_t*) w + 16);
+
+      const v128_t vprod0x2 = wasm_i16x8_mul(vxa0, vxb2);
+      vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_low_i16x8(vprod0x2));
+      vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_high_i16x8(vprod0x2));
+      const v128_t vprod1x2 = wasm_i16x8_mul(vxa1, vxb2);
+      vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_extend_low_i16x8(vprod1x2));
+      vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_extend_high_i16x8(vprod1x2));
+      const v128_t vxb3 = wasm_v128_load((const int16_t*) w + 24);
+
+      const v128_t vprod0x3 = wasm_i16x8_mul(vxa0, vxb3);
+      vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_low_i16x8(vprod0x3));
+      vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_high_i16x8(vprod0x3));
+      const v128_t vprod1x3 = wasm_i16x8_mul(vxa1, vxb3);
+      vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_extend_low_i16x8(vprod1x3));
+      vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_extend_high_i16x8(vprod1x3));
+
+      w = (const void*) ((const int16_t*) w + 32);
+      k += 8 * sizeof(int8_t);
+    }
+
+    const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7));
+    const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7));
+    const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7));
+    const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7));
+
+    v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7));
+    v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7));
+
+    const v128_t vsign0x0123 = wasm_i32x4_shr(vacc0x0123, 31);
+    const v128_t vsign1x0123 = wasm_i32x4_shr(vacc1x0123, 31);
+
+    const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5);
+    const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5);
+
+    const v128_t vmultiplier = wasm_v128_load(params->gemmlowp_wasmsimd.multiplier);
+    const v128_t vrounding = wasm_v128_load(params->gemmlowp_wasmsimd.rounding);
+    const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding);
+    const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7);
+    const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding);
+    const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7);
+
+    const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding);
+    const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding);
+
+    const v128_t vq31prod0x0123 = wasm_v32x4_shuffle(vprod0x01, vprod0x23, 1, 3, 5, 7);
+    const v128_t vq31prod1x0123 = wasm_v32x4_shuffle(vprod1x01, vprod1x23, 1, 3, 5, 7);
+
+    const v128_t vremainder_mask = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_mask);
+    const v128_t vrem0x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod0x0123, vremainder_mask), wasm_i32x4_shr(vq31prod0x0123, 31));
+    const v128_t vrem1x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod1x0123, vremainder_mask), wasm_i32x4_shr(vq31prod1x0123, 31));
+
+    const v128_t vthreshold = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_threshold);
+    const int32_t vshift = params->gemmlowp_wasmsimd.shift;
+    vacc0x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod0x0123, vshift), wasm_i32x4_gt(vrem0x0123, vthreshold));
+    vacc1x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod1x0123, vshift), wasm_i32x4_gt(vrem1x0123, vthreshold));
+
+    const v128_t voutput_zero_point = wasm_v128_load(params->gemmlowp_wasmsimd.output_zero_point);
+    v128_t vacc01x0123 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+    v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc01x0123);
+
+    const v128_t voutput_min = wasm_v128_load(params->gemmlowp_wasmsimd.output_min);
+    vout = wasm_i8x16_max(vout, voutput_min);
+
+    const v128_t voutput_max = wasm_v128_load(params->gemmlowp_wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
+    if (nc >= 4) {
+      *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
+      *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) wasm_i16x8_extract_lane(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) wasm_i16x8_extract_lane(vout, 2);
+        c1 += 2;
+        vout = wasm_u32x4_shr(vout, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) wasm_i8x16_extract_lane(vout, 0);
+        *c1 = (int8_t) wasm_i8x16_extract_lane(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
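
Unlike the x86 kernels in this change, this wasmsimd kernel stays on gemmlowp requantization. In scalar form the Q31 scheme above is as follows (a sketch; the wasmsimd code reaches the same result with a pre-doubled multiplier, taking the high 32-bit lane of each 64-bit product, and the remainder_mask/remainder_threshold params encode mask and mask >> 1):

#include <stdint.h>

// Q31 rounding-doubling high multiply, then a rounding arithmetic right
// shift (ties away from zero), then zero point and clamp.
static inline int8_t requantize_gemmlowp(int32_t acc, int32_t multiplier,
                                         uint32_t shift, int16_t zero_point,
                                         int8_t out_min, int8_t out_max) {
  const int64_t product = (int64_t) acc * (int64_t) multiplier;
  const int32_t q31prod = (int32_t) ((product + INT64_C(0x40000000)) >> 31);

  const int32_t mask = (int32_t) ((UINT32_C(1) << shift) - 1);
  const int32_t remainder = (q31prod & mask) - (int32_t) (q31prod < 0);
  int32_t out = (q31prod >> shift) + (int32_t) (remainder > (mask >> 1));

  out += zero_point;
  if (out < (int32_t) out_min) out = out_min;
  if (out > (int32_t) out_max) out = out_max;
  return (int8_t) out;
}
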
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c
similarity index 65%
rename from src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c
index 54a91c4..3221462 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -100,47 +100,24 @@
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index 31f4048..0000000
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c8-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 2);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-
-  do {
-    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
-    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
-    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
-    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
-    __m128i vacc1x0 = vacc0x0;
-    __m128i vacc1x1 = vacc0x1;
-    __m128i vacc1x2 = vacc0x2;
-    __m128i vacc1x3 = vacc0x3;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
-      a1 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
-      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
-    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
-    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
-    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
-
-    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
-    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
-    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-    const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
-
-    const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
-    const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
-
-    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-    const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
-
-    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
-    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-    const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
-
-    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-    const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
-
-    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-    const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
-
-    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-    const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
-
-    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
-    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
-
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
-        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
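
One SSE2-specific detail in the kernel deleted above: SSE2 has no signed 32x32-to-64-bit multiply (_mm_mul_epi32 is SSE4.1), so it multiplies magnitudes with _mm_mul_epu32 and restores the sign of each 64-bit product with an xor/subtract identity, (x ^ m) - m, which negates x when the mask m is all ones and is a no-op when m is zero. A scalar model of one lane (the Q31 multiplier is always non-negative):

#include <stdint.h>

static inline int64_t sse2_style_signed_mul(int32_t acc, uint32_t multiplier) {
  const uint64_t nmask = (acc < 0) ? UINT64_MAX : 0;        // vnmask lane
  const uint32_t uacc = (uint32_t) acc;
  const uint64_t absacc = (acc < 0) ? (uint64_t) (0u - uacc) : (uint64_t) uacc;
  const uint64_t absprod = absacc * (uint64_t) multiplier;  // _mm_mul_epu32
  return (int64_t) ((absprod ^ nmask) - nmask);             // restore sign
}

The SSSE3 version deleted further below is identical except that it computes the magnitudes with _mm_abs_epi32 instead of the xor/subtract pair.
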
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
deleted file mode 100644
index a448b3f..0000000
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
+++ /dev/null
@@ -1,167 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c8-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 2);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-
-  do {
-    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
-    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
-    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
-    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
-    __m128i vacc1x0 = vacc0x0;
-    __m128i vacc1x1 = vacc0x1;
-    __m128i vacc1x2 = vacc0x2;
-    __m128i vacc1x3 = vacc0x3;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
-      a1 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
-      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
-    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
-    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
-    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
-
-    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
-    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
-
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
deleted file mode 100644
index 9e275fa..0000000
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c8-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <tmmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 2);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-
-  do {
-    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
-    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
-    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
-    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
-    __m128i vacc1x0 = vacc0x0;
-    __m128i vacc1x1 = vacc0x1;
-    __m128i vacc1x2 = vacc0x2;
-    __m128i vacc1x3 = vacc0x3;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
-      a1 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
-      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
-    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
-    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
-    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
-
-    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
-    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
-    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-    const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
-
-    const __m128i vabsacc0x0123 = _mm_abs_epi32(vacc0x0123);
-    const __m128i vabsacc1x0123 = _mm_abs_epi32(vacc1x0123);
-
-    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-    const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
-
-    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
-    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-    const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
-
-    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-    const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
-
-    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-    const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
-
-    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-    const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
-
-    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
-    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
-
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
-        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
deleted file mode 100644
index 684ea57..0000000
--- a/src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
+++ /dev/null
@@ -1,197 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx8c8-avx2.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <immintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/intrinsics-polyfill.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 2);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-
-  do {
-    const __m128i vbias0x0 = _mm_loadu_si32(w);
-    const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
-    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
-    const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
-    const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
-    __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
-    const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
-    const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
-    __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
-    const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
-    const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
-    __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
-    __m256i vacc1x01 = vacc0x01;
-    __m256i vacc1x23 = vacc0x23;
-    __m256i vacc1x45 = vacc0x45;
-    __m256i vacc1x67 = vacc0x67;
-    w = (const void*) ((const int32_t*) w + 8);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
-      const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
-      a0 += 8;
-      const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
-      const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
-      a1 += 8;
-
-      const __m256i vxb01 = _mm256_load_si256((const __m256i*) w);
-
-      vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
-      vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
-      const __m256i vxb23 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 16));
-
-      vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
-      vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
-      const __m256i vxb45 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 32));
-
-      vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
-      vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
-      const __m256i vxb67 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 48));
-
-      vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
-      vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
-
-      w = (const void*) ((const int16_t*) w + 64);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
-    const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
-    const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
-    const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
-
-    const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
-    const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
-
-    const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-    __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
-    __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
-
-    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.multiplier);
-    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.rounding);
-
-    const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
-    const __m256i vacc1x11335577 = _mm256_srli_epi64(vacc1x01234567, 32);
-
-    const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
-    const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
-
-    const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x11335577, vmultiplier), vrounding);
-    const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x11335577, vmultiplier), vrounding);
-
-    const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
-    const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
-    const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
-    const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
-
-    const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
-    const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
-
-    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_mask);
-    const __m256i vrem0x01234567 =
-      _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
-    const __m256i vrem1x01234567 =
-      _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
-
-    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_threshold);
-    const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_avx2.shift);
-    vacc0x01234567 =
-      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
-    vacc1x01234567 =
-      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
-
-    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_zero_point);
-    __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
-
-    vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
-
-    __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc01x01234567);
-
-    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_min));
-    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_max));
-
-    __m128i vout_lo = _mm256_castsi256_si128(vout);
-    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
-
-    if (nc >= 8) {
-      _mm_storel_epi64((__m128i*) c0, vout_lo);
-      _mm_storel_epi64((__m128i*) c1, vout_hi);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        _mm_storeu_si32(c0, vout_lo);
-        _mm_storeu_si32(c1, vout_hi);
-
-        c0 += 4;
-        c1 += 4;
-
-        vout_lo = _mm_srli_epi64(vout_lo, 32);
-        vout_hi = _mm_srli_epi64(vout_hi, 32);
-      }
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
-
-        c0 += 2;
-        c1 += 2;
-
-        vout_lo = _mm_srli_epi32(vout_lo, 16);
-        vout_hi = _mm_srli_epi32(vout_hi, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
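
A note on the reduction in the AVX2 kernel deleted above: _mm256_hadd_epi32 works within each 128-bit half, so two rounds of it leave the eight channel sums in the order {0, 2, 4, 6, 1, 3, 5, 7} (hence the name vacc0x02461357), and the _mm256_permutevar8x32_epi32 with mask (7, 3, 6, 2, 5, 1, 4, 0) restores channel order. A plain-array sketch of the index bookkeeping:

#include <stdio.h>

// Model of _mm256_hadd_epi32: each 128-bit half reduces adjacent pairs,
// first from operand a, then from operand b.
static void hadd8(const int a[8], const int b[8], int out[8]) {
  for (int half = 0; half < 2; half++) {
    const int o = half * 4;
    out[o + 0] = a[o + 0] + a[o + 1];
    out[o + 1] = a[o + 2] + a[o + 3];
    out[o + 2] = b[o + 0] + b[o + 1];
    out[o + 3] = b[o + 2] + b[o + 3];
  }
}

int main(void) {
  // Four partial sums per channel; encode the channel id as the value.
  int x01[8], x23[8], x45[8], x67[8];
  for (int i = 0; i < 4; i++) {
    x01[i] = 0; x01[4 + i] = 1;
    x23[i] = 2; x23[4 + i] = 3;
    x45[i] = 4; x45[4 + i] = 5;
    x67[i] = 6; x67[4 + i] = 7;
  }
  int t0[8], t1[8], sums[8];
  hadd8(x01, x23, t0);
  hadd8(x45, x67, t1);
  hadd8(t0, t1, sums);  // channel order is now {0,2,4,6,1,3,5,7}, scaled by 4
  const int idx[8] = {0, 4, 1, 5, 2, 6, 3, 7};  // _mm256_set_epi32(7,3,6,2,5,1,4,0)
  for (int i = 0; i < 8; i++) {
    printf("%d ", sums[idx[i]] / 4);  // prints 0 1 2 3 4 5 6 7
  }
  printf("\n");
  return 0;
}
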
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c
similarity index 71%
rename from src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
rename to src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c
index dfdcc16..a85b5ec 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c
@@ -16,7 +16,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -153,58 +153,28 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
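
The hunk above is the core of the conversion: the Q31 fixed-point multiplier/rounding/shift pipeline is replaced by convert-to-float, multiply-by-scale, convert-back. A minimal scalar sketch of the new path follows (illustrative names, not XNNPACK API; it assumes the default round-to-nearest-even MXCSR mode that _mm_cvtps_epi32 uses):

    #include <math.h>
    #include <stdint.h>

    // Scalar sketch of FP32 requantization (hypothetical helper, not kernel code).
    static inline int8_t fp32_requantize(int32_t acc, float scale,
                                         int32_t output_zero_point,
                                         int32_t output_min, int32_t output_max) {
      // Scale the int32 accumulator in single precision, then round to
      // nearest-even -- what _mm_cvtps_epi32 does under the default mode.
      int32_t out = (int32_t) lrintf((float) acc * scale);
      out += output_zero_point;
      if (out < output_min) out = output_min;
      if (out > output_max) out = output_max;
      return (int8_t) out;
    }

In the vector kernels the zero-point addition and clamping ride on saturating narrowing (_mm_packs_epi32/_mm_adds_epi16) plus the explicit min/max against output_min/output_max, so no separate overflow handling is needed.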
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c
similarity index 64%
copy from src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c
index dfdcc16..b5680dc 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c
@@ -9,14 +9,14 @@
 
 #include <assert.h>
 
-#include <smmintrin.h>
+#include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/math.h>
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -62,13 +62,13 @@
     size_t k = kc;
     while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
       a1 += 8;
       const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
       a2 += 8;
 
       const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -109,13 +109,13 @@
     }
     if (k != 0) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
       a1 = (const int8_t*) ((uintptr_t) a1 + k);
       const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
       a2 = (const int8_t*) ((uintptr_t) a2 + k);
 
       const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -153,63 +153,37 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
-      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
@@ -231,9 +205,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
-        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
       }
 
       nc = 0;
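
The SSE2 copy cannot use the SSE4.1 intrinsics of its source file, so the generator swaps in the SSE2-era idioms visible above: sign extension by self-unpack plus arithmetic shift (replacing _mm_cvtepi8_epi16), clamping at int16 width before the final pack (replacing _mm_max_epi8/_mm_min_epi8), and lane extraction via _mm_srli_si128/_mm_extract_epi16 (replacing _mm_extract_epi32/_mm_extract_epi8). A standalone demo of the first and last idiom (hypothetical test code, not part of the kernels):

    #include <emmintrin.h>  // SSE2 only
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const int8_t bytes[8] = {-128, -1, 0, 1, 2, 3, 127, -7};
      const __m128i va = _mm_loadl_epi64((const __m128i*) bytes);

      // Sign-extend 8 x int8 -> 8 x int16: interleaving each byte with itself
      // leaves it in the high half of a 16-bit lane, and an arithmetic shift
      // right by 8 replicates its sign bit into the low half.
      const __m128i vxa = _mm_srai_epi16(_mm_unpacklo_epi8(va, va), 8);
      printf("lane 0 = %d, lane 7 = %d\n",
             (int16_t) _mm_extract_epi16(vxa, 0),
             (int16_t) _mm_extract_epi16(vxa, 7));  // -128, -7

      // Extract 32-bit lane 1 without _mm_extract_epi32: shift the register
      // right by 4 bytes, then read the low lane.
      __m128i vout = _mm_set_epi32(4, 3, 2, 1);
      vout = _mm_srli_si128(vout, 4);
      printf("lane 1 = %d\n", _mm_cvtsi128_si32(vout));  // 2
      return 0;
    }

For the single-byte tail, (int8_t) _mm_extract_epi16(vout, 2) works because byte 4 of the register is the low byte of 16-bit lane 2.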
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c
similarity index 71%
copy from src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c
index dfdcc16..813025a 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c
@@ -16,7 +16,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -153,58 +153,28 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
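
Note the clamp placement differs between the two copies: the SSE4.1 variant above clamps the packed int8 vector directly, while the SSE2 variant clamps the int16 intermediates before _mm_packs_epi16. The results are identical because the pack saturates to [-128, 127] and the output bounds always lie inside that range; a quick standalone check of that equivalence (demo code under those assumptions, build with -msse4.1):

    #include <smmintrin.h>  // SSE4.1 (includes SSE2)
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
      const __m128i v16 = _mm_setr_epi16(-300, -100, -5, 0, 5, 100, 200, 300);
      const __m128i vmin16 = _mm_set1_epi16(-20), vmax16 = _mm_set1_epi16(20);
      const __m128i vmin8 = _mm_set1_epi8(-20), vmax8 = _mm_set1_epi8(20);

      // SSE2 order: clamp at 16 bits, then saturating-pack to 8 bits.
      const __m128i vclamped16 = _mm_min_epi16(_mm_max_epi16(v16, vmin16), vmax16);
      const __m128i va = _mm_packs_epi16(vclamped16, vclamped16);
      // SSE4.1 order: saturating-pack first, then clamp at 8 bits.
      const __m128i vb = _mm_min_epi8(_mm_max_epi8(_mm_packs_epi16(v16, v16), vmin8), vmax8);

      int8_t ra[16], rb[16];
      _mm_storeu_si128((__m128i*) ra, va);
      _mm_storeu_si128((__m128i*) rb, vb);
      printf("%s\n", memcmp(ra, rb, sizeof(ra)) == 0 ? "match" : "MISMATCH");
      return 0;
    }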
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-ssse3.c
similarity index 100%
rename from src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
rename to src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-ssse3.c
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c
similarity index 70%
rename from src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c
index 3312851..55557ae 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c
@@ -21,7 +21,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -158,58 +158,28 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
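
The XOP copy differs from the SSE4.1 one only in how the dot products are formed; its requantization epilogue is identical and reads the same fp32_sse4 params. Those params are laid out so every load above can be a full-width aligned load rather than a broadcast; a simplified sketch of the assumed layout (the real definition is the xnn_qs8_conv_minmax_params union in XNNPACK's params headers, with each array 16-byte aligned):

    // Assumed, simplified: each scalar is pre-duplicated to fill an XMM register.
    struct fp32_sse4_params_sketch {
      float   scale[4];              // requantization scale, duplicated 4x (_mm_load_ps)
      int16_t output_zero_point[8];  // duplicated 8x, added at int16 width
      int8_t  output_min[16];        // duplicated 16x, for _mm_max_epi8
      int8_t  output_max[16];        // duplicated 16x, for _mm_min_epi8
    };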
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
deleted file mode 100644
index 84e896d..0000000
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
+++ /dev/null
@@ -1,242 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c2-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__avx(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 3);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 2);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-
-  do {
-    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
-    __m128i vacc1x0123 = vacc0x0123;
-    __m128i vacc2x0123 = vacc0x0123;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = kc;
-    while (k >= 8 * sizeof(int8_t)) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
-      a1 += 8;
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
-      a2 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k -= 8 * sizeof(int8_t);
-    }
-    if (k != 0) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 = (const int8_t*) ((uintptr_t) a0 + k);
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
-      a1 = (const int8_t*) ((uintptr_t) a1 + k);
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
-      a2 = (const int8_t*) ((uintptr_t) a2 + k);
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-      w = (const void*) ((const int16_t*) w + 8);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
-      if (k > 2 * sizeof(int8_t)) {
-        const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
-        w = (const void*) ((const int16_t*) w + 8);
-
-        vacc0x0123 = _mm_add_epi32(vacc0x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc1x0123 = _mm_add_epi32(vacc1x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc2x0123 = _mm_add_epi32(vacc2x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
-        if (k > 4 * sizeof(int8_t)) {
-          const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
-          w = (const void*) ((const int16_t*) w + 8);
-
-          vacc0x0123 = _mm_add_epi32(vacc0x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc1x0123 = _mm_add_epi32(vacc1x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc2x0123 = _mm_add_epi32(vacc2x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-        }
-      }
-    }
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
-      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
-        c2 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
-        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
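
For reference, the deleted kernels implemented gemmlowp-style requantization: a Q31 rounding-doubling high multiply followed by a rounding arithmetic shift right, which the remainder_mask/remainder_threshold sequence above vectorizes. A scalar sketch of the removed arithmetic (illustrative; it assumes remainder_threshold == remainder_mask >> 1 as set up by the params initializer, and an arithmetic right shift of negative values, as on the targeted compilers):

    #include <stdint.h>

    static inline int32_t gemmlowp_requantize(int32_t acc, int32_t multiplier, uint32_t shift) {
      // Q31 product with rounding: per lane, (acc * multiplier + (1 << 30)) >> 31.
      const int64_t product = (int64_t) acc * (int64_t) multiplier;
      const int32_t q31prod = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
      // Rounding arithmetic shift right; the remainder correction rounds
      // ties away from zero, matching the vrem/vremainder_threshold logic.
      const int32_t mask = (int32_t) ((UINT32_C(1) << shift) - 1);
      const int32_t remainder = (q31prod & mask) - (int32_t) (q31prod < 0);
      return (q31prod >> shift) + (int32_t) (remainder > (mask >> 1));
    }

The zero-point addition and clamping after this point are the same as in the FP32 path.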
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index ec7add0..0000000
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,278 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c2-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse2(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 3);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 2);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-
-  do {
-    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
-    __m128i vacc1x0123 = vacc0x0123;
-    __m128i vacc2x0123 = vacc0x0123;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = kc;
-    while (k >= 8 * sizeof(int8_t)) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
-      a1 += 8;
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
-      a2 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k -= 8 * sizeof(int8_t);
-    }
-    if (k != 0) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 = (const int8_t*) ((uintptr_t) a0 + k);
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
-      a1 = (const int8_t*) ((uintptr_t) a1 + k);
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
-      a2 = (const int8_t*) ((uintptr_t) a2 + k);
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-      w = (const void*) ((const int16_t*) w + 8);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
-      if (k > 2 * sizeof(int8_t)) {
-        const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
-        w = (const void*) ((const int16_t*) w + 8);
-
-        vacc0x0123 = _mm_add_epi32(vacc0x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc1x0123 = _mm_add_epi32(vacc1x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc2x0123 = _mm_add_epi32(vacc2x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
-        if (k > 4 * sizeof(int8_t)) {
-          const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
-          w = (const void*) ((const int16_t*) w + 8);
-
-          vacc0x0123 = _mm_add_epi32(vacc0x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc1x0123 = _mm_add_epi32(vacc1x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc2x0123 = _mm_add_epi32(vacc2x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-        }
-      }
-    }
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
-    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-    const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
-    const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
-
-    const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
-    const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
-    const __m128i vabsacc2x0123 = _mm_sub_epi32(_mm_xor_si128(vacc2x0123, vnmask2x0123), vnmask2x0123);
-
-    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-    const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
-    const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
-
-    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
-    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-    const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
-    const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
-
-    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-    const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
-    const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
-
-    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-    const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
-    const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
-
-    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-    const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
-    const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
-
-    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
-    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
-        c2 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
-        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
-        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
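
The deleted SSE2 flavor is noticeably longer because _mm_mul_epi32 (signed 32x32->64 multiply) is an SSE4.1 instruction: it multiplied magnitudes with _mm_mul_epu32 and restored the sign through the vnmask xor/subtract masks visible above. The scalar equivalent of that sign fixup (a sketch with an illustrative name; the gemmlowp multiplier is non-negative, so only the accumulator's sign needs handling):

    #include <stdint.h>

    static inline int64_t mul_s32_by_u32_sse2_style(int32_t a, uint32_t multiplier) {
      const int64_t nmask = -(int64_t) (a < 0);  // 0 or -1, like vnmask0x0123
      // |a| as (a ^ nmask) - nmask in unsigned arithmetic (handles INT32_MIN).
      const uint32_t abs_a = ((uint32_t) a ^ (uint32_t) nmask) - (uint32_t) nmask;
      const int64_t abs_prod = (int64_t) ((uint64_t) abs_a * multiplier);
      // Conditionally negate: (x ^ -1) - (-1) == -x, like the
      // _mm_sub_epi64(_mm_xor_si128(vabsprod, vnmask), vnmask) step.
      return (abs_prod ^ nmask) - nmask;
    }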
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c
similarity index 65%
rename from src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
rename to src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c
index fc8a9ec..bb698ba 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -115,58 +115,28 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c
new file mode 100644
index 0000000..7a167e1
--- /dev/null
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c
@@ -0,0 +1,178 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((const int32_t*) w + 4);
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
+      a2 += 8;
+
+      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+      w = (const void*) ((const int16_t*) w + 32);
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+    const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
+    const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
+
+    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+    __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
+
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c
similarity index 65%
copy from src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c
index fc8a9ec..82ce7ce 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c
@@ -15,7 +15,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -115,58 +115,28 @@
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
     __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
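+    // FP32 requantization: scale the int32 accumulators in single precision
+    // and round back to int32, replacing the removed Q31 fixed-point
+    // multiply/shift sequence.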
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
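+    // SSE4.1 clamps the packed int8 result directly with _mm_max_epi8 and
+    // _mm_min_epi8; the SSE2/SSSE3 kernels must clamp in int16 before packing.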
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c
new file mode 100644
index 0000000..e0403c7
--- /dev/null
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c
@@ -0,0 +1,178 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <tmmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+    __m128i vacc1x0 = vacc0x0;
+    __m128i vacc1x1 = vacc0x1;
+    __m128i vacc1x2 = vacc0x2;
+    __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
+    w = (const void*) ((const int32_t*) w + 4);
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
+      a0 += 8;
+      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
+      a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
+      a2 += 8;
+
+      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
+
+      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
+
+      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
+
+      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
+
+      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+      w = (const void*) ((const int16_t*) w + 32);
+      k += 8 * sizeof(int8_t);
+    }
+
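+    // SSSE3 provides _mm_hadd_epi32, so two rounds of pairwise horizontal adds
+    // reduce the four per-column accumulators of each row.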
+    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
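+    // Requantize in FP32: one float multiply plus a rounding float-to-int
+    // conversion replace the fixed-point multiply/shift chain.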
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+    const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+    if (nc >= 4) {
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd.c
new file mode 100644
index 0000000..f9b0a8e
--- /dev/null
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd.c
@@ -0,0 +1,221 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c8-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 8);
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  const v128_t vzero = wasm_f64x2_splat(0.0);
+  do {
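+    // Each int32 bias is bit-cast through float and inserted into lane 0 of a
+    // zeroed vector; the remaining lanes stay zero until accumulation.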
+    v128_t vacc0x0 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[0]);
+    v128_t vacc0x1 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[1]);
+    v128_t vacc0x2 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[2]);
+    v128_t vacc0x3 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[3]);
+    v128_t vacc1x0 = vacc0x0;
+    v128_t vacc1x1 = vacc0x1;
+    v128_t vacc1x2 = vacc0x2;
+    v128_t vacc1x3 = vacc0x3;
+    v128_t vacc2x0 = vacc0x0;
+    v128_t vacc2x1 = vacc0x1;
+    v128_t vacc2x2 = vacc0x2;
+    v128_t vacc2x3 = vacc0x3;
+    w = (const void*) ((const int32_t*) w + 4);
+
+    size_t k = 0;
+    while (k < kc) {
+      const v128_t vxa0 = wasm_i16x8_load8x8(a0);
+      a0 += 8;
+      const v128_t vxa1 = wasm_i16x8_load8x8(a1);
+      a1 += 8;
+      const v128_t vxa2 = wasm_i16x8_load8x8(a2);
+      a2 += 8;
+
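+      // int8*int8 products fit in int16, so the 16-bit multiply is exact; the
+      // low and high halves are then sign-extended to int32 and accumulated.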
+      const v128_t vxb0 = wasm_v128_load(w);
+
+      const v128_t vprod0x0 = wasm_i16x8_mul(vxa0, vxb0);
+      vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_low_i16x8(vprod0x0));
+      vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_high_i16x8(vprod0x0));
+      const v128_t vprod1x0 = wasm_i16x8_mul(vxa1, vxb0);
+      vacc1x0 = wasm_i32x4_add(vacc1x0, wasm_i32x4_extend_low_i16x8(vprod1x0));
+      vacc1x0 = wasm_i32x4_add(vacc1x0, wasm_i32x4_extend_high_i16x8(vprod1x0));
+      const v128_t vprod2x0 = wasm_i16x8_mul(vxa2, vxb0);
+      vacc2x0 = wasm_i32x4_add(vacc2x0, wasm_i32x4_extend_low_i16x8(vprod2x0));
+      vacc2x0 = wasm_i32x4_add(vacc2x0, wasm_i32x4_extend_high_i16x8(vprod2x0));
+      const v128_t vxb1 = wasm_v128_load((const int16_t*) w + 8);
+
+      const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);
+      vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_low_i16x8(vprod0x1));
+      vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_high_i16x8(vprod0x1));
+      const v128_t vprod1x1 = wasm_i16x8_mul(vxa1, vxb1);
+      vacc1x1 = wasm_i32x4_add(vacc1x1, wasm_i32x4_extend_low_i16x8(vprod1x1));
+      vacc1x1 = wasm_i32x4_add(vacc1x1, wasm_i32x4_extend_high_i16x8(vprod1x1));
+      const v128_t vprod2x1 = wasm_i16x8_mul(vxa2, vxb1);
+      vacc2x1 = wasm_i32x4_add(vacc2x1, wasm_i32x4_extend_low_i16x8(vprod2x1));
+      vacc2x1 = wasm_i32x4_add(vacc2x1, wasm_i32x4_extend_high_i16x8(vprod2x1));
+      const v128_t vxb2 = wasm_v128_load((const int16_t*) w + 16);
+
+      const v128_t vprod0x2 = wasm_i16x8_mul(vxa0, vxb2);
+      vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_low_i16x8(vprod0x2));
+      vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_high_i16x8(vprod0x2));
+      const v128_t vprod1x2 = wasm_i16x8_mul(vxa1, vxb2);
+      vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_extend_low_i16x8(vprod1x2));
+      vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_extend_high_i16x8(vprod1x2));
+      const v128_t vprod2x2 = wasm_i16x8_mul(vxa2, vxb2);
+      vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_extend_low_i16x8(vprod2x2));
+      vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_extend_high_i16x8(vprod2x2));
+      const v128_t vxb3 = wasm_v128_load((const int16_t*) w + 24);
+
+      const v128_t vprod0x3 = wasm_i16x8_mul(vxa0, vxb3);
+      vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_low_i16x8(vprod0x3));
+      vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_high_i16x8(vprod0x3));
+      const v128_t vprod1x3 = wasm_i16x8_mul(vxa1, vxb3);
+      vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_extend_low_i16x8(vprod1x3));
+      vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_extend_high_i16x8(vprod1x3));
+      const v128_t vprod2x3 = wasm_i16x8_mul(vxa2, vxb3);
+      vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_extend_low_i16x8(vprod2x3));
+      vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_extend_high_i16x8(vprod2x3));
+
+      w = (const void*) ((const int16_t*) w + 32);
+      k += 8 * sizeof(int8_t);
+    }
+
+    const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7));
+    const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7));
+    const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7));
+    const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7));
+    const v128_t vacc2x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, vacc2x2, 2, 6, 3, 7));
+    const v128_t vacc2x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x1, vacc2x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x1, vacc2x3, 2, 6, 3, 7));
+
+    v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7));
+    v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7));
+    v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7));
+
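+    // gemmlowp-style Q31 requantization: multiply by the Q31 multiplier with
+    // rounding in 64-bit lanes, keep the high halves, then apply a rounding
+    // arithmetic shift with remainder correction.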
+    const v128_t vsign0x0123 = wasm_i32x4_shr(vacc0x0123, 31);
+    const v128_t vsign1x0123 = wasm_i32x4_shr(vacc1x0123, 31);
+    const v128_t vsign2x0123 = wasm_i32x4_shr(vacc2x0123, 31);
+
+    const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5);
+    const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5);
+    const v128_t vacc2x01 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 0, 4, 1, 5);
+
+    const v128_t vmultiplier = wasm_v128_load(params->gemmlowp_wasmsimd.multiplier);
+    const v128_t vrounding = wasm_v128_load(params->gemmlowp_wasmsimd.rounding);
+    const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding);
+    const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7);
+    const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding);
+    const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7);
+    const v128_t vprod2x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc2x01, vmultiplier), vrounding);
+    const v128_t vacc2x23 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 2, 6, 3, 7);
+
+    const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding);
+    const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding);
+    const v128_t vprod2x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc2x23, vmultiplier), vrounding);
+
+    const v128_t vq31prod0x0123 = wasm_v32x4_shuffle(vprod0x01, vprod0x23, 1, 3, 5, 7);
+    const v128_t vq31prod1x0123 = wasm_v32x4_shuffle(vprod1x01, vprod1x23, 1, 3, 5, 7);
+    const v128_t vq31prod2x0123 = wasm_v32x4_shuffle(vprod2x01, vprod2x23, 1, 3, 5, 7);
+
+    const v128_t vremainder_mask = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_mask);
+    const v128_t vrem0x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod0x0123, vremainder_mask), wasm_i32x4_shr(vq31prod0x0123, 31));
+    const v128_t vrem1x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod1x0123, vremainder_mask), wasm_i32x4_shr(vq31prod1x0123, 31));
+    const v128_t vrem2x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod2x0123, vremainder_mask), wasm_i32x4_shr(vq31prod2x0123, 31));
+
+    const v128_t vthreshold = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_threshold);
+    const int32_t vshift = params->gemmlowp_wasmsimd.shift;
+    vacc0x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod0x0123, vshift), wasm_i32x4_gt(vrem0x0123, vthreshold));
+    vacc1x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod1x0123, vshift), wasm_i32x4_gt(vrem1x0123, vthreshold));
+    vacc2x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod2x0123, vshift), wasm_i32x4_gt(vrem2x0123, vthreshold));
+
+    const v128_t voutput_zero_point = wasm_v128_load(params->gemmlowp_wasmsimd.output_zero_point);
+    v128_t vacc01x0123 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0x0123, vacc1x0123), voutput_zero_point);
+    v128_t vacc22x0123 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+    v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc22x0123);
+
+    const v128_t voutput_min = wasm_v128_load(params->gemmlowp_wasmsimd.output_min);
+    vout = wasm_i8x16_max(vout, voutput_min);
+
+    const v128_t voutput_max = wasm_v128_load(params->gemmlowp_wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
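+    // Reinterpreting vout as f32 lanes lets a single extract-lane store write
+    // four int8 outputs per row at once.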
+    if (nc >= 4) {
+      *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
+      *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
+      *((float*) c2) = (float) wasm_f32x4_extract_lane(vout, 2);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) wasm_i16x8_extract_lane(vout, 0);
+        c0 += 2;
+        *((uint16_t*) c1) = (uint16_t) wasm_i16x8_extract_lane(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) wasm_i16x8_extract_lane(vout, 4);
+        c2 += 2;
+        vout = wasm_u32x4_shr(vout, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) wasm_i8x16_extract_lane(vout, 0);
+        *c1 = (int8_t) wasm_i8x16_extract_lane(vout, 4);
+        *c2 = (int8_t) wasm_i8x16_extract_lane(vout, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c
similarity index 64%
copy from src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
copy to src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c
index 54a91c4..2719fb6 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c
@@ -20,7 +20,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -33,7 +33,7 @@
     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
 {
   assert(mr != 0);
-  assert(mr <= 2);
+  assert(mr <= 3);
   assert(nc != 0);
   assert(kc != 0);
   assert(kc % sizeof(int8_t) == 0);
@@ -46,10 +46,16 @@
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 2) {
+  if XNN_UNPREDICTABLE(mr < 2) {
     a1 = a0;
     c1 = c0;
   }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
 
   do {
     __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
@@ -60,6 +66,10 @@
     __m128i vacc1x1 = vacc0x1;
     __m128i vacc1x2 = vacc0x2;
     __m128i vacc1x3 = vacc0x3;
+    __m128i vacc2x0 = vacc0x0;
+    __m128i vacc2x1 = vacc0x1;
+    __m128i vacc2x2 = vacc0x2;
+    __m128i vacc2x3 = vacc0x3;
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t k = 0;
@@ -70,23 +80,30 @@
       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
       const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
       a1 += 8;
+      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      a2 += 8;
 
       const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
 
       vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
       vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
+      vacc2x0 = _mm_maddd_epi16(vxa2, vxb0, vacc2x0);
       const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
 
       vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
       vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
+      vacc2x1 = _mm_maddd_epi16(vxa2, vxb1, vacc2x1);
       const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
 
       vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
       vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
+      vacc2x2 = _mm_maddd_epi16(vxa2, vxb2, vacc2x2);
       const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
 
       vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
       vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
+      vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
 
       w = (const void*) ((const int16_t*) w + 32);
       k += 8 * sizeof(int8_t);
@@ -96,61 +113,48 @@
     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
 
     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
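+    // FP32 requantization, as in the SSE4.1 kernel; the XOP variants share the
+    // fp32_sse4 parameters.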
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
 
 
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
 
       nc -= 4;
     } else {
@@ -159,11 +163,14 @@
         c0 += 2;
         *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
         c1 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
         *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
       }
 
       nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
deleted file mode 100644
index 0e1fd6b..0000000
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
+++ /dev/null
@@ -1,204 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c8-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 3);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-
-  do {
-    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
-    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
-    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
-    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
-    __m128i vacc1x0 = vacc0x0;
-    __m128i vacc1x1 = vacc0x1;
-    __m128i vacc1x2 = vacc0x2;
-    __m128i vacc1x3 = vacc0x3;
-    __m128i vacc2x0 = vacc0x0;
-    __m128i vacc2x1 = vacc0x1;
-    __m128i vacc2x2 = vacc0x2;
-    __m128i vacc2x3 = vacc0x3;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
-      a1 += 8;
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
-      a2 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
-      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
-      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
-      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
-      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
-    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
-    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
-    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
-    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
-    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
-
-    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
-    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
-    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
-      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
-        c2 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
-        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index 2a4c9b1..0000000
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c8-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 3);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-
-  do {
-    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
-    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
-    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
-    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
-    __m128i vacc1x0 = vacc0x0;
-    __m128i vacc1x1 = vacc0x1;
-    __m128i vacc1x2 = vacc0x2;
-    __m128i vacc1x3 = vacc0x3;
-    __m128i vacc2x0 = vacc0x0;
-    __m128i vacc2x1 = vacc0x1;
-    __m128i vacc2x2 = vacc0x2;
-    __m128i vacc2x3 = vacc0x3;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
-      a1 += 8;
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
-      a2 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
-      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
-      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
-      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
-      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
-    const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
-    const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
-    const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
-    const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
-    const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
-
-    __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
-    __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
-    __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
-    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-    const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
-    const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
-
-    const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
-    const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
-    const __m128i vabsacc2x0123 = _mm_sub_epi32(_mm_xor_si128(vacc2x0123, vnmask2x0123), vnmask2x0123);
-
-    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-    const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
-    const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
-
-    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
-    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-    const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
-    const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
-
-    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-    const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
-    const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
-
-    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-    const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
-    const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
-
-    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-    const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
-    const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
-
-    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
-    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
-        c2 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
-        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
-        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
deleted file mode 100644
index a7d781b..0000000
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c8-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <tmmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 3);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-
-  do {
-    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
-    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
-    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
-    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
-    __m128i vacc1x0 = vacc0x0;
-    __m128i vacc1x1 = vacc0x1;
-    __m128i vacc1x2 = vacc0x2;
-    __m128i vacc1x3 = vacc0x3;
-    __m128i vacc2x0 = vacc0x0;
-    __m128i vacc2x1 = vacc0x1;
-    __m128i vacc2x2 = vacc0x2;
-    __m128i vacc2x3 = vacc0x3;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
-      a1 += 8;
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
-      a2 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
-      vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
-      vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
-      vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
-      vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
-    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
-    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
-    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
-    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
-    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
-
-    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
-    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
-    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
-    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-    const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
-    const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
-
-    const __m128i vabsacc0x0123 = _mm_abs_epi32(vacc0x0123);
-    const __m128i vabsacc1x0123 = _mm_abs_epi32(vacc1x0123);
-    const __m128i vabsacc2x0123 = _mm_abs_epi32(vacc2x0123);
-
-    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-    const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
-    const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
-
-    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
-    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-    const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
-    const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
-
-    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-    const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
-    const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
-
-    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-    const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
-    const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
-
-    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-    const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
-    const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
-
-    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
-    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
-        c2 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
-        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
-        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
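
The bulk of each deleted kernel above is GEMMLOWP's Q31 fixed-point requantization: a 64-bit multiply by a Q31 multiplier with rounding, followed by a rounding arithmetic right shift built from a remainder mask and threshold. Below is a scalar sketch of what each 32-bit lane computes; it is for exposition only, names follow the vector code, the rounding constant is assumed to be 2^30, and an arithmetic right shift of negative values is assumed.

    #include <stdint.h>

    // Scalar model of the GEMMLOWP requantization deleted above (sketch only;
    // the SSE2/SSSE3 kernels emulate the signed 64-bit multiply with the
    // abs/xor/sub trick, the SSE4/XOP kernels use _mm_mul_epi32 directly).
    static inline int8_t gemmlowp_requantize(
        int32_t acc, int32_t multiplier, uint32_t shift,
        int32_t remainder_mask, int32_t remainder_threshold,
        int32_t output_zero_point, int32_t output_min, int32_t output_max)
    {
      // Q31 multiply with rounding: (acc * multiplier + 2^30) >> 31.
      const int64_t product = (int64_t) acc * (int64_t) multiplier;
      const int32_t q31prod = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
      // Rounding right shift: bias the remainder for negative inputs, then
      // round up when it exceeds the threshold (the vector cmpgt masks are
      // 0 or -1, hence the add/sub pairing in the intrinsics above).
      const int32_t remainder =
          (q31prod & remainder_mask) - (int32_t) (q31prod < 0);
      int32_t out =
          (q31prod >> shift) + (int32_t) (remainder > remainder_threshold);
      // Re-center on the output zero point and clamp to the quantized range.
      out += output_zero_point;
      out = out < output_min ? output_min : out;
      out = out > output_max ? output_max : out;
      return (int8_t) out;
    }
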
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
deleted file mode 100644
index a54da33..0000000
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
+++ /dev/null
@@ -1,209 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c8-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#if defined(__GNUC__) || defined(__clang__)
-  #include <x86intrin.h>
-#else
-  #include <immintrin.h>
-  #include <ammintrin.h>
-#endif
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 3);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-
-  do {
-    __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
-    __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
-    __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
-    __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
-    __m128i vacc1x0 = vacc0x0;
-    __m128i vacc1x1 = vacc0x1;
-    __m128i vacc1x2 = vacc0x2;
-    __m128i vacc1x3 = vacc0x3;
-    __m128i vacc2x0 = vacc0x0;
-    __m128i vacc2x1 = vacc0x1;
-    __m128i vacc2x2 = vacc0x2;
-    __m128i vacc2x3 = vacc0x3;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
-      a1 += 8;
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
-      a2 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
-      vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
-      vacc2x0 = _mm_maddd_epi16(vxa2, vxb0, vacc2x0);
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
-      vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
-      vacc2x1 = _mm_maddd_epi16(vxa2, vxb1, vacc2x1);
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
-      vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
-      vacc2x2 = _mm_maddd_epi16(vxa2, vxb2, vacc2x2);
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
-      vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
-      vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
-    const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
-    const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
-    const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
-    const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
-    const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
-
-    __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
-    __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
-    __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-    __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
-      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
-        c2 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
-        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
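
The XOP variant deleted above is structurally identical to the SSSE3 one but about 30 lines shorter (209 vs. 240 per the hunk headers), mainly because XOP's fused multiply-accumulate collapses the two-instruction SSE inner-loop sequence into one; both forms appear verbatim in the deleted code:

    // SSE2/SSSE3/SSE4.1 form (two instructions):
    vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
    // XOP form (one fused instruction):
    vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);

It also sign-extends inputs with _mm_cvtepi8_epi16 and multiplies with the signed _mm_mul_epi32, so the abs/xor/sub sign fix-up from the SSE2/SSSE3 requantization disappears.
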
diff --git a/src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
deleted file mode 100644
index 019873f..0000000
--- a/src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
+++ /dev/null
@@ -1,238 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx8c8-avx2.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <immintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/intrinsics-polyfill.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 3);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 8);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-
-  do {
-    const __m128i vbias0x0 = _mm_loadu_si32(w);
-    const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
-    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
-    const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
-    const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
-    __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
-    const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
-    const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
-    __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
-    const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
-    const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
-    __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
-    __m256i vacc1x01 = vacc0x01;
-    __m256i vacc1x23 = vacc0x23;
-    __m256i vacc1x45 = vacc0x45;
-    __m256i vacc1x67 = vacc0x67;
-    __m256i vacc2x01 = vacc0x01;
-    __m256i vacc2x23 = vacc0x23;
-    __m256i vacc2x45 = vacc0x45;
-    __m256i vacc2x67 = vacc0x67;
-    w = (const void*) ((const int32_t*) w + 8);
-
-    size_t k = 0;
-    while (k < kc) {
-      const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
-      const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
-      a0 += 8;
-      const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
-      const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
-      a1 += 8;
-      const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
-      const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
-      a2 += 8;
-
-      const __m256i vxb01 = _mm256_load_si256((const __m256i*) w);
-
-      vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
-      vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
-      vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
-      const __m256i vxb23 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 16));
-
-      vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
-      vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
-      vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
-      const __m256i vxb45 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 32));
-
-      vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
-      vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
-      vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
-      const __m256i vxb67 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 48));
-
-      vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
-      vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
-      vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
-
-      w = (const void*) ((const int16_t*) w + 64);
-      k += 8 * sizeof(int8_t);
-    }
-
-    const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
-    const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
-    const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
-    const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
-    const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
-    const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
-
-    const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
-    const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
-    const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
-
-    const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-    __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
-    __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
-    __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
-
-    const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.multiplier);
-    const __m256i vrounding = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.rounding);
-
-    const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
-    const __m256i vacc1x11335577 = _mm256_srli_epi64(vacc1x01234567, 32);
-    const __m256i vacc2x11335577 = _mm256_srli_epi64(vacc2x01234567, 32);
-
-    const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
-    const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
-    const __m256i vprod2x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x01234567, vmultiplier), vrounding);
-
-    const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x11335577, vmultiplier), vrounding);
-    const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x11335577, vmultiplier), vrounding);
-    const __m256i vprod2x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x11335577, vmultiplier), vrounding);
-
-    const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
-    const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
-    const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
-    const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
-    const __m256i vq31prod2x0246 = _mm256_srli_epi64(vprod2x0246, 31);
-    const __m256i vq31prod2x1357 = _mm256_add_epi64(vprod2x1357, vprod2x1357);
-
-    const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
-    const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
-    const __m256i vq31prod2x01234567 = _mm256_blend_epi16(vq31prod2x0246, vq31prod2x1357, 0xCC);
-
-    const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_mask);
-    const __m256i vrem0x01234567 =
-      _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
-    const __m256i vrem1x01234567 =
-      _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
-    const __m256i vrem2x01234567 =
-      _mm256_add_epi32(_mm256_and_si256(vq31prod2x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod2x01234567));
-
-    const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_avx2.shift);
-    vacc0x01234567 =
-      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
-    vacc1x01234567 =
-      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
-    vacc2x01234567 =
-      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod2x01234567, vshift), _mm256_cmpgt_epi32(vrem2x01234567, vremainder_threshold));
-
-    const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_zero_point);
-    __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
-    __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
-
-    vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
-    vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
-
-    __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
-
-    vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_min));
-    vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_max));
-
-    __m128i vout_lo = _mm256_castsi256_si128(vout);
-    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
-
-    if (nc >= 8) {
-      _mm_storel_epi64((__m128i*) c0, vout_lo);
-      _mm_storel_epi64((__m128i*) c1, vout_hi);
-      _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        _mm_storeu_si32(c0, vout_lo);
-        _mm_storeu_si32(c1, vout_hi);
-        *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
-
-        c0 += 4;
-        c1 += 4;
-        c2 += 4;
-
-        vout_lo = _mm_srli_epi64(vout_lo, 32);
-        vout_hi = _mm_srli_epi64(vout_hi, 32);
-      }
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
-        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
-
-        c0 += 2;
-        c1 += 2;
-        c2 += 2;
-
-        vout_lo = _mm_srli_epi32(vout_lo, 16);
-        vout_hi = _mm_srli_epi32(vout_hi, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
-        *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
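
In the AVX2 kernel deleted above, _mm256_hadd_epi32 adds adjacent pairs independently within each 128-bit lane, so the two reduction stages leave the eight column sums in the interleaved order the variable names record (02461357: columns 0, 2, 4, 6 in the low lane and 1, 3, 5, 7 in the high lane). The _mm256_permutevar8x32_epi32 with index vector {0, 4, 1, 5, 2, 6, 3, 7} (written high-to-low in _mm256_set_epi32) restores natural column order before requantization, and the later _mm256_permute4x64_epi64 plays the same role after the lane-wise packs.
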
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c
similarity index 70%
rename from src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
rename to src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c
index c33e288..ec55f91 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c
@@ -16,7 +16,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -180,68 +180,31 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-    const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-    const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-    const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
-    const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-    const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-    const __m128i vrem3x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-    vacc3x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
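
The replacement shown in this hunk is the point of the commit: instead of the Q31 multiply/shift pipeline, each 32-bit accumulator is converted to float, multiplied by a single precomputed FP32 scale, and converted back with _mm_cvtps_epi32, which rounds to nearest-even under the default MXCSR mode. A scalar sketch of one output lane follows; it is an approximation in that the saturation performed by the packs instructions is folded into the final clamp.

    #include <math.h>
    #include <stdint.h>

    // Scalar model of the FP32 requantization that replaces GEMMLOWP above.
    static inline int8_t fp32_requantize(
        int32_t acc, float scale,
        int32_t output_zero_point, int32_t output_min, int32_t output_max)
    {
      const float scaled = (float) acc * scale;   // _mm_cvtepi32_ps + _mm_mul_ps
      int32_t out = (int32_t) lrintf(scaled);     // _mm_cvtps_epi32 (nearest-even)
      out += output_zero_point;                   // _mm_adds_epi16 with zero point
      out = out < output_min ? output_min : out;  // clamp (int8 or int16 domain,
      out = out > output_max ? output_max : out;  // depending on the ISA level)
      return (int8_t) out;                        // _mm_packs_epi16
    }
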
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c
similarity index 63%
copy from src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
copy to src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c
index c33e288..2974662 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c
@@ -9,14 +9,14 @@
 
 #include <assert.h>
 
-#include <smmintrin.h>
+#include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/math.h>
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -69,16 +69,16 @@
     size_t k = kc;
     while (k >= 8 * sizeof(int8_t)) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 += 8;
       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
       a1 += 8;
       const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
       a2 += 8;
       const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
-      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      const __m128i vxa3 = _mm_srai_epi16(_mm_unpacklo_epi8(va3, va3), 8);
       a3 += 8;
 
       const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -127,16 +127,16 @@
     }
     if (k != 0) {
       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
       a1 = (const int8_t*) ((uintptr_t) a1 + k);
       const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
       a2 = (const int8_t*) ((uintptr_t) a2 + k);
       const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
-      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+      const __m128i vxa3 = _mm_srai_epi16(_mm_unpacklo_epi8(va3, va3), 8);
       a3 = (const int8_t*) ((uintptr_t) a3 + k);
 
       const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -180,74 +180,42 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-    const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-    const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-    const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
-    const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-    const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-    const __m128i vrem3x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-    vacc3x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
+    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
-      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
-      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+      vout = _mm_srli_si128(vout, 4);
+      *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
@@ -273,10 +241,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
-        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
-        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c3 = (int8_t) _mm_extract_epi16(vout, 6);
       }
 
       nc = 0;
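
The SSE2 copy of the same kernel works around three instructions missing below SSE4.1, all visible in this hunk: pmovsxbw (so sign extension uses the unpack-and-shift idiom below), pmaxsb/pminsb (so the output is clamped in the int16 domain before the final pack rather than in the int8 domain after it), and pextrd/pextrb (so the column stores fall back to _mm_cvtsi128_si32 plus _mm_srli_si128, and the nc & 1 tail to _mm_extract_epi16). The sign-extension idiom, wrapped in an illustrative helper (the name is ours, not XNNPACK's):

    #include <emmintrin.h>

    // SSE2 stand-in for _mm_cvtepi8_epi16: duplicate each byte into both
    // halves of a 16-bit lane, then arithmetic-shift to keep the sign.
    static inline __m128i sse2_cvtepi8_epi16(__m128i v) {
      return _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8);
    }
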
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c
similarity index 70%
copy from src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
copy to src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c
index c33e288..ca8b6d9 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c
@@ -16,7 +16,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -180,68 +180,31 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-    const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-    const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-    const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
-    const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-    const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-    const __m128i vrem3x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-    vacc3x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-ssse3.c
similarity index 100%
rename from src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
rename to src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-ssse3.c
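
A 100% similarity index records a pure rename with no content hunks, so unlike the AVX, SSE2, SSE4.1, and XOP variants around it, this SSSE3 file changes only its path in this diff.
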
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c
similarity index 70%
rename from src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c
index 462b459..943403e 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c
@@ -21,7 +21,7 @@
 
 
 
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -185,68 +185,31 @@
       }
     }
 
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+    __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+    __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+    __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+    __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
 
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+    vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+    vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+    vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+    vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale);
 
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-    const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
+    vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+    vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+    vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+    vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
 
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-    const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-    const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
-    const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-    const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-    const __m128i vrem3x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-    vacc3x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
     __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
 
 
     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
 
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
 
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
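
Across these renamed kernels, the AVX, SSE4.1, and XOP variants all read the same fp32_sse4 parameter block (scale, output_zero_point, output_min, output_max, with 8-bit min/max applied after packing), while only the SSE2 variant uses the separate fp32_sse2 block, whose 16-bit min/max match its pre-pack clamp.
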
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index 488e22d..0000000
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,330 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c2-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 2);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
-  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
-    __m128i vacc1x0123 = vacc0x0123;
-    __m128i vacc2x0123 = vacc0x0123;
-    __m128i vacc3x0123 = vacc0x0123;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = kc;
-    while (k >= 8 * sizeof(int8_t)) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
-      a1 += 8;
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
-      a2 += 8;
-      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
-      const __m128i vxa3 = _mm_srai_epi16(_mm_unpacklo_epi8(va3, va3), 8);
-      a3 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc3x0123 = _mm_add_epi32(vacc3x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc3x0123 = _mm_add_epi32(vacc3x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc3x0123 = _mm_add_epi32(vacc3x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc3x0123 = _mm_add_epi32(vacc3x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k -= 8 * sizeof(int8_t);
-    }
-    if (k != 0) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
-      a0 = (const int8_t*) ((uintptr_t) a0 + k);
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
-      a1 = (const int8_t*) ((uintptr_t) a1 + k);
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
-      a2 = (const int8_t*) ((uintptr_t) a2 + k);
-      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
-      const __m128i vxa3 = _mm_srai_epi16(_mm_unpacklo_epi8(va3, va3), 8);
-      a3 = (const int8_t*) ((uintptr_t) a3 + k);
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-      w = (const void*) ((const int16_t*) w + 8);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc3x0123 = _mm_add_epi32(vacc3x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
-      if (k > 2 * sizeof(int8_t)) {
-        const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
-        w = (const void*) ((const int16_t*) w + 8);
-
-        vacc0x0123 = _mm_add_epi32(vacc0x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc1x0123 = _mm_add_epi32(vacc1x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc2x0123 = _mm_add_epi32(vacc2x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc3x0123 = _mm_add_epi32(vacc3x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
-        if (k > 4 * sizeof(int8_t)) {
-          const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
-          w = (const void*) ((const int16_t*) w + 8);
-
-          vacc0x0123 = _mm_add_epi32(vacc0x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc1x0123 = _mm_add_epi32(vacc1x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc2x0123 = _mm_add_epi32(vacc2x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc3x0123 = _mm_add_epi32(vacc3x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-        }
-      }
-    }
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
-    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-    const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
-    const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
-    const __m128i vnmask3x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc3x0123);
-
-    const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
-    const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
-    const __m128i vabsacc2x0123 = _mm_sub_epi32(_mm_xor_si128(vacc2x0123, vnmask2x0123), vnmask2x0123);
-    const __m128i vabsacc3x0123 = _mm_sub_epi32(_mm_xor_si128(vacc3x0123, vnmask3x0123), vnmask3x0123);
-
-    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vabsacc3x1133 = _mm_shuffle_epi32(vabsacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-    const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
-    const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
-    const __m128i vabsprod3x02 = _mm_mul_epu32(vabsacc3x0123, vmultiplier);
-
-    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
-    const __m128i vnmask3x02 = _mm_shuffle_epi32(vnmask3x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
-    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-    const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
-    const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
-    const __m128i vprod3x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod3x02, vnmask3x02), vnmask3x02);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
-    const __m128i vq31prod3x02 = _mm_srli_epi64(_mm_add_epi64(vprod3x02, vrounding), 31);
-
-    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-    const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
-    const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
-    const __m128i vabsprod3x13 = _mm_mul_epu32(vabsacc3x1133, vmultiplier);
-
-    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vnmask3x13 = _mm_shuffle_epi32(vnmask3x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-    const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
-    const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
-    const __m128i vprod3x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod3x13, vnmask3x13), vnmask3x13);
-
-    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-    const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
-    const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
-    const __m128i vq31prod3x13 = _mm_srli_epi64(_mm_add_epi64(vprod3x13, vrounding), 31);
-
-    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
-    const __m128i vq31prod3x0213 = _mm_castps_si128(_mm_shuffle_ps(
-        _mm_castsi128_ps(vq31prod3x02), _mm_castsi128_ps(vq31prod3x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
-    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
-    const __m128i vq31prod3x0123 = _mm_shuffle_epi32(vq31prod3x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-    const __m128i vrem3x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-    vacc3x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
-
-    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
-    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
-    vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-    vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
-
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
-      vout = _mm_srli_si128(vout, 4);
-      *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
-        c2 += 2;
-        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
-        c3 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
-        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
-        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
-        *c3 = (int8_t) _mm_extract_epi16(vout, 6);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
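
For reference, the SSE2 file just deleted implements, per 32-bit lane, the following scalar requantization (a sketch reconstructed from the vector code above; the abs/xor dance in the vector version exists only because SSE2 lacks a signed 32x32->64 multiply):

    #include <stdint.h>

    // Scalar equivalent of one lane of the deleted Q31 requantization:
    // a rounding Q31 fixed-point multiply, then a rounding arithmetic
    // right shift built from remainder_mask / remainder_threshold.
    static inline int32_t gemmlowp_requantize_lane(
        int32_t acc, int32_t multiplier, uint32_t shift)
    {
      // vq31prod: (acc * multiplier + 2^30) >> 31, in 64-bit arithmetic.
      const int64_t product = (int64_t) acc * (int64_t) multiplier;
      const int32_t q31prod = (int32_t) ((product + INT64_C(0x40000000)) >> 31);
      // vrem: the bits to be shifted out, biased by -1 for negative inputs
      // so that half-way cases round away from zero.
      const int32_t remainder_mask = (int32_t) ((UINT32_C(1) << shift) - 1);
      const int32_t remainder = (q31prod & remainder_mask) - (int32_t) (q31prod < 0);
      // vacc: arithmetic shift plus the rounding increment (assumes the
      // usual arithmetic >> on negative int32).
      return (q31prod >> shift) + (int32_t) (remainder > (remainder_mask >> 1));
    }
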
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
deleted file mode 100644
index 6aaba0d..0000000
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+++ /dev/null
@@ -1,285 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gemm/MRx4c2-sse.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const int8_t* restrict a,
-    size_t a_stride,
-    const void* restrict w,
-    int8_t* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  kc = round_up_po2(kc, 2);
-  const int8_t* a0 = a;
-  int8_t* c0 = c;
-  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
-  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
-  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
-  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
-    __m128i vacc1x0123 = vacc0x0123;
-    __m128i vacc2x0123 = vacc0x0123;
-    __m128i vacc3x0123 = vacc0x0123;
-    w = (const void*) ((const int32_t*) w + 4);
-
-    size_t k = kc;
-    while (k >= 8 * sizeof(int8_t)) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 += 8;
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
-      a1 += 8;
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
-      a2 += 8;
-      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
-      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
-      a3 += 8;
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc3x0123 = _mm_add_epi32(vacc3x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      vacc3x0123 = _mm_add_epi32(vacc3x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      vacc3x0123 = _mm_add_epi32(vacc3x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-      vacc3x0123 = _mm_add_epi32(vacc3x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
-      w = (const void*) ((const int16_t*) w + 32);
-      k -= 8 * sizeof(int8_t);
-    }
-    if (k != 0) {
-      const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
-      const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
-      a0 = (const int8_t*) ((uintptr_t) a0 + k);
-      const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
-      const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
-      a1 = (const int8_t*) ((uintptr_t) a1 + k);
-      const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
-      const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
-      a2 = (const int8_t*) ((uintptr_t) a2 + k);
-      const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
-      const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
-      a3 = (const int8_t*) ((uintptr_t) a3 + k);
-
-      const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-      w = (const void*) ((const int16_t*) w + 8);
-
-      vacc0x0123 = _mm_add_epi32(vacc0x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc1x0123 = _mm_add_epi32(vacc1x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc2x0123 = _mm_add_epi32(vacc2x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      vacc3x0123 = _mm_add_epi32(vacc3x0123,
-        _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
-      if (k > 2 * sizeof(int8_t)) {
-        const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
-        w = (const void*) ((const int16_t*) w + 8);
-
-        vacc0x0123 = _mm_add_epi32(vacc0x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc1x0123 = _mm_add_epi32(vacc1x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc2x0123 = _mm_add_epi32(vacc2x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-        vacc3x0123 = _mm_add_epi32(vacc3x0123,
-          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
-        if (k > 4 * sizeof(int8_t)) {
-          const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
-          w = (const void*) ((const int16_t*) w + 8);
-
-          vacc0x0123 = _mm_add_epi32(vacc0x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc1x0123 = _mm_add_epi32(vacc1x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc2x0123 = _mm_add_epi32(vacc2x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-          vacc3x0123 = _mm_add_epi32(vacc3x0123,
-            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-        }
-      }
-    }
-
-    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
-    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
-    const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-    const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
-    const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-    const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-    const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-    const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
-
-    const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-    const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-    const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-    const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
-
-    const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
-    const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-    const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
-    const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-    const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
-    const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-    const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
-    const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
-
-    const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-    const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-    const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-    const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
-
-    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
-    const __m128i vrem0x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-    const __m128i vrem1x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-    const __m128i vrem2x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-    const __m128i vrem3x0123 =
-      _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
-    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
-    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
-    vacc0x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-    vacc1x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-    vacc2x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-    vacc3x0123 =
-      _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
-    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
-    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
-
-
-    __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
-
-    vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
-    vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
-    if (nc >= 4) {
-      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-      *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
-      *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
-      *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
-
-      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
-
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
-      nc -= 4;
-    } else {
-      if (nc & 2) {
-        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
-        c0 += 2;
-        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
-        c1 += 2;
-        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
-        c2 += 2;
-        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
-        c3 += 2;
-        vout = _mm_srli_epi32(vout, 16);
-      }
-      if (nc & 1) {
-        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
-        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
-        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
-        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
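
The SSE4.1 variant deleted above is the same kernel with three cheaper idioms, all visible in the code: pmovsxbw for sign extension, a signed _mm_mul_epi32 in place of the unsigned multiply plus sign fixup, and _mm_blend_epi16 / _mm_extract_epi32 for lane merging and stores. The sign-extension difference in isolation:

    #include <smmintrin.h>  // SSE4.1 (also pulls in the SSE2 intrinsics)

    // va holds 8 signed bytes in the low half of the register.
    static inline __m128i sign_extend_i8_to_i16_sse2(__m128i va) {
      // No pmovsxbw before SSE4.1: duplicate each byte into both halves
      // of a 16-bit lane, then arithmetic-shift to sign-extend.
      return _mm_srai_epi16(_mm_unpacklo_epi8(va, va), 8);
    }

    static inline __m128i sign_extend_i8_to_i16_sse41(__m128i va) {
      return _mm_cvtepi8_epi16(va);  // single instruction
    }
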
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 83739f5..99d15d1 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -938,50 +938,45 @@
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop)
-
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop)
 
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__avx2)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__avx2)
@@ -991,10 +986,6 @@
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2)
-
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2)
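
Each renamed declaration follows the scheme xnn_qs8_gemm_xw_minmax_fp32_ukernel_MRxNRcKR__ISA: qs8 for signed 8-bit quantization, xw for extended (pre-widened) weights, minmax for the fused clamp, fp32 for the requantization scheme, an MRxNR output tile consuming KR channels per step, and the target ISA. For orientation, each DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(fn) line declares roughly the signature of the kernels deleted above (a sketch; XNN_INTERNAL and the exact array/restrict qualifiers on params are elided):

    void fn(
        size_t mr,                 // rows of A/C actually computed (<= MR)
        size_t nc,                 // columns of C remaining
        size_t kc,                 // reduction dimension, in bytes
        const int8_t* restrict a,  // input activations
        size_t a_stride,
        const void* restrict w,    // packed bias + extended int16 weights
        int8_t* restrict c,
        size_t cm_stride,
        size_t cn_stride,
        const union xnn_qs8_conv_minmax_params* params);
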
diff --git a/test/qs8-gemm-minmax-fp32.cc b/test/qs8-gemm-minmax-fp32.cc
index bd2e271..7ccf6d1 100644
--- a/test/qs8-gemm-minmax-fp32.cc
+++ b/test/qs8-gemm-minmax-fp32.cc
@@ -45623,6 +45623,14049 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
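
Every renamed xw kernel picks up the full generated test battery, as above and below; extended_weights(true) presumably tells GemmMicrokernelTester to pack the weights in the pre-widened int16 layout these kernels consume. A hypothetical per-tile view of that layout, reconstructed from the pointer arithmetic in the deleted kernels (not an actual XNNPACK type):

    #include <stdint.h>

    // One 4-column tile of xw-packed weights for a c2 kernel: 4 biases,
    // then int16 weights pre-widened from int8, interleaved as 2
    // consecutive k values per column so one 128-bit load feeds pmaddwd.
    struct xw_weight_tile_4c2 {
      int32_t bias[4];
      int16_t b[][4][2];  // round_up(kc, 2) / 2 groups of 4 columns x 2 k
    };

The non-xw kernels store int8 weights and widen them in the inner loop, so xw trades twice the weight footprint for less unpacking work per iteration.
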
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
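+  // Extended-weights (XW) QS8 GEMM microkernel, 3x4c2 tile, SSE4.1, FP32
+  // requantization: the tests below cover m/n/k edge cases plus strided A and C.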
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
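+  // Extended-weights (XW) QS8 GEMM microkernel, 4x4c2 tile, SSE4.1, FP32
+  // requantization: the tests below cover m/n/k edge cases plus strided A and C.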
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
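+  // Extended-weights (XW) QS8 GEMM microkernel, 1x4c2 tile, AVX, FP32
+  // requantization: the tests below cover m/n/k edge cases plus strided A and C.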
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
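+  // Extended-weights (XW) QS8 GEMM microkernel, 2x4c2 tile, AVX, FP32
+  // requantization: the tests below cover m/n/k edge cases plus strided A and C.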
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
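+// For orientation while reading these tests: a minimal sketch of the FP32
+// requantization that the xnn_qs8_requantize_fp32 reference path models. The
+// int32 accumulator is scaled by a float multiplier, rounded to the nearest
+// integer (ties to even), offset by the output zero point, and clamped to
+// [qmin, qmax]. This is an illustrative approximation, not the production
+// code; the function and parameter names below are assumptions.
+#include <cmath>    // std::lrintf
+#include <cstdint>  // int8_t, int32_t
+
+static inline int8_t sketch_requantize_fp32(
+    int32_t acc, float scale, int8_t zero_point, int8_t qmin, int8_t qmax) {
+  const float scaled = (float) acc * scale;
+  // Under the default rounding mode, lrintf rounds to nearest, ties to even.
+  long out = std::lrintf(scaled) + (long) zero_point;
+  if (out < (long) qmin) out = (long) qmin;
+  if (out > (long) qmax) out = (long) qmax;
+  return (int8_t) out;
+}
+
+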
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
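+// The "xw" in these test names stands for extended weights: the packing step
+// widens the int8 weights to int16 ahead of time, so the microkernel can feed
+// them straight into 16-bit multiplies without unpacking in the inner loop,
+// at the cost of doubling the packed-weight footprint. A hypothetical sketch
+// of that widening (real packing also interleaves and reorders the weights):
+#include <cstddef>  // size_t
+
+static inline void sketch_extend_weights(
+    const int8_t* w, int16_t* xw, size_t count) {
+  for (size_t i = 0; i < count; i++) {
+    xw[i] = (int16_t) w[i];  // plain sign extension from int8 to int16
+  }
+}
+
+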
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
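+// The stride values sprinkled through these tests are deliberately awkward
+// primes: the a_stride values (11, 19, 43, 83) sit above the largest k their
+// enclosing loops reach, and cn_stride/cm_stride(7) sits above nr = 4, so the
+// physical row pitch never matches the logical row length and a kernel that
+// ignores its stride arguments reads or writes the wrong elements.
+// Illustrative indexing under that convention (a sketch, not the tester's
+// actual code):
+static inline int8_t sketch_a_at(
+    const int8_t* a, size_t a_stride, size_t row, size_t col) {
+  return a[row * a_stride + col];  // valid while col < k and k <= a_stride
+}
+
+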
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
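+// TEST_REQUIRES_X86_XOP skips the XOP-coded cases at runtime when the host
+// CPU lacks AMD's XOP extension. A minimal sketch of such a guard (an
+// assumption about the mechanism, not the macro's real definition); XOP is
+// reported via CPUID leaf 0x80000001, ECX bit 11, queried here with
+// GCC/Clang's <cpuid.h>:
+#include <cpuid.h>
+
+static inline bool sketch_has_x86_xop() {
+  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+  if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
+    return false;  // extended CPUID leaf unavailable
+  }
+  return (ecx & (1u << 11)) != 0;  // ECX bit 11 = XOP
+}
+
+#define SKETCH_REQUIRE_X86_XOP()           \
+  do {                                     \
+    if (!sketch_has_x86_xop()) {           \
+      GTEST_SKIP() << "XOP not supported"; \
+    }                                      \
+  } while (0)
+
+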
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
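+// 3x4c2 XOP extended-weights GEMM: same coverage matrix as the 2x4c2 variant
+// above, with MR = 3 (so subtile sweeps run m over 1..3).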
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
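+// 4x4c2 XOP extended-weights GEMM: the widest XOP tile in this file, MR = 4.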
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(4)
+        .nr(4)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(4)
+            .nr(4)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(4)
+      .nr(4)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
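+// From here the tests switch from the XOP c2 kernels to the SSE2 c8 kernels:
+// the 8-element K block is unchanged but kr(8) replaces kr(2), the ISA guard
+// drops to TEST_REQUIRES_X86_SSE2, and the params initializer switches from
+// the sse4 variant to xnn_init_qs8_conv_minmax_fp32_sse2_params.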
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
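+// 2x4c8 SSE2 extended-weights GEMM: MR = 2 counterpart of the 1x4c8 kernel above.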
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
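+// Editorial note: the generated tests in this file follow a fixed taxonomy.
+// The k_eq_8 / k_lt_8 / k_gt_8 / k_div_8 cases exercise the KR=8 remainder
+// handling of the c8 microkernels; n_gt_4 / n_div_4 exercise NR=4 column
+// tiling; the strided_a / strided_cn / strided_cm variants run with
+// non-contiguous input and output strides; and the *_subtile variants sweep
+// every m <= MR, n <= NR subtile with a single iteration each.
+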
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_eq_8) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_lt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_gt_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_div_8) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_gt_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_div_4) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, strided_cm) {
+    TEST_REQUIRES_X86_SSE2;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
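+// Editorial note: extended_weights(true) routes the tester through the _xw_
+// ("extended weights") kernel variants, whose packed weights are presumably
+// pre-widened to 16 bits at packing time so the microkernel can
+// multiply-accumulate without sign-extending int8 weights in the inner loop.
+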
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_eq_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, strided_cn) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_lt_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_gt_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_div_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_gt_4) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_div_4) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, strided_cm) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
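+// Editorial note: xnn_qs8_requantize_fp32 is the scalar reference the tester
+// checks the kernels against. A minimal sketch of its behavior, assuming it
+// matches the reference in src/xnnpack/requantization.h (clamping happens in
+// the scaled domain before the zero point is re-added):
+//
+//   int8_t requantize_fp32(int32_t acc, float scale,
+//                          int8_t zero_point, int8_t qmin, int8_t qmax) {
+//     float scaled = (float) acc * scale;
+//     scaled = std::max(scaled, (float) ((int32_t) qmin - (int32_t) zero_point));
+//     scaled = std::min(scaled, (float) ((int32_t) qmax - (int32_t) zero_point));
+//     return (int8_t) ((int32_t) lrintf(scaled) + (int32_t) zero_point);
+//   }
+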
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_eq_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, strided_cn) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_lt_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_gt_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_div_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_gt_4) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_div_4) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, strided_cm) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
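+// Editorial note: the SSSE3 kernels are initialized with
+// xnn_init_qs8_conv_minmax_fp32_sse2_params rather than an SSSE3-specific
+// initializer, presumably because they consume the same parameter layout as
+// the SSE2 kernels.
+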
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_eq_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, strided_cn) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_lt_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_gt_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_div_8) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_gt_4) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_div_4) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSSE3;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, strided_cm) {
+    TEST_REQUIRES_X86_SSSE3;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
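+// The SSE4.1 blocks below repeat the same sweep for the extended-weights
+// ("xw") 1x4c8, 2x4c8, and 3x4c8 tiles; only the ISA check, the ukernel under
+// test, and the params initializer (sse4 instead of sse2) change per block.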
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
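+// 2x4c8 SSE4.1: identical sweep with MR=2, so full-tile cases use m(2) and
+// the subtile loops cover m in [1, 2].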
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
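+// 3x4c8 SSE4.1: identical sweep with MR=3; subtile loops cover m in [1, 3].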
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_eq_8) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_lt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_gt_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_div_8) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_div_8_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_div_4_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, strided_cm_subtile) {
+    TEST_REQUIRES_X86_SSE41;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, strided_cm) {
+    TEST_REQUIRES_X86_SSE41;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
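+// AVX 1x4c8: the AVX xw microkernel shares the SSE4 parameter layout, so it
+// is initialized with xnn_init_qs8_conv_minmax_fp32_sse4_params like the
+// SSE4.1 kernels above.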
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_div_8_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_div_4_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, strided_cm_subtile) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, strided_cm) {
+    TEST_REQUIRES_X86_AVX;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(1)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(1)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(1)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(2)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(2)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(2)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
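All of the XW tests added here pin the kernels to the FP32 requantization path: `xnn_init_qs8_conv_minmax_fp32_sse4_params` builds the kernel parameters, and `xnn_qs8_requantize_fp32` is the reference requantization the tester checks the kernel against. For reference, a minimal scalar sketch of FP32 requantization, assuming the usual convert-scale-round-clamp formulation (the shipped SIMD kernels fuse these steps into vector code):

    #include <math.h>
    #include <stdint.h>

    // Scalar sketch of FP32 requantization (assumed formulation): convert the
    // int32 accumulator to float, apply the float scale, round to nearest-even,
    // re-add the output zero point, then clamp to the quantized output range.
    static inline int8_t requantize_fp32(
      int32_t acc, float scale, int16_t output_zero_point,
      int8_t qmin, int8_t qmax)
    {
      const float scaled = (float) acc * scale;
      int32_t out = (int32_t) lrintf(scaled) + (int32_t) output_zero_point;
      if (out < (int32_t) qmin) out = (int32_t) qmin;
      if (out > (int32_t) qmax) out = (int32_t) qmax;
      return (int8_t) out;
    }

Compared with the fixed-point GEMMLOWP scheme whose tests are removed further down, this is a shorter pipeline: one float multiply and one rounding step instead of a high-half multiply plus rounding shifts.
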
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_eq_8) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cn_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_eq_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_eq_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_lt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_lt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_lt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_gt_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_gt_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_gt_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_div_8) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_div_8_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .extended_weights(true)
+        .mr(3)
+        .nr(4)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(4)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_div_8_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_gt_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_gt_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_div_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_div_4_strided_a) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_div_4_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, strided_cm_subtile) {
+    TEST_REQUIRES_X86_XOP;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .extended_weights(true)
+            .mr(3)
+            .nr(4)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, strided_cm) {
+    TEST_REQUIRES_X86_XOP;
+    GemmMicrokernelTester()
+      .extended_weights(true)
+      .mr(3)
+      .nr(4)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(4)
+      .k(8)
+      .cm_stride(7)
+      .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
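Every test above sets `.extended_weights(true)`: the `_xw_` kernels consume packed weights that were pre-widened from int8 to int16, so the inner loop can multiply-accumulate without widening the weights at run time; only the int8 activations still need sign extension. A rough SSE2 sketch of one KR = 8 block of a 1x4c8-style extended-weights dot product — the function name and weight layout are hypothetical, and the real XOP kernels use different instructions and packing:

    #include <emmintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    // Hypothetical one-block (KR = 8) extended-weights QS8 dot product:
    // the weights w are already int16, only the activations get widened.
    static void qs8_xw_block_1x4c8(const int8_t a[8], const int16_t w[4][8],
                                   int32_t acc[4])
    {
      const __m128i va8 = _mm_loadl_epi64((const __m128i*) a);
      // SSE2 sign-extension idiom: duplicate each byte into both halves of a
      // 16-bit lane, then arithmetic-shift the lane right by 8.
      const __m128i va = _mm_srai_epi16(_mm_unpacklo_epi8(va8, va8), 8);
      for (size_t n = 0; n < 4; n++) {
        const __m128i vw = _mm_loadu_si128((const __m128i*) w[n]);
        __m128i vacc = _mm_madd_epi16(va, vw);  // 8 products -> 4 int32 sums
        // Horizontal reduction of the 4 partial sums into one int32.
        vacc = _mm_add_epi32(vacc, _mm_shuffle_epi32(vacc, _MM_SHUFFLE(1, 0, 3, 2)));
        vacc = _mm_add_epi32(vacc, _mm_shuffle_epi32(vacc, _MM_SHUFFLE(2, 3, 0, 1)));
        acc[n] += _mm_cvtsi128_si32(vacc);
      }
    }

The trade-off is memory: widening to int16 doubles the packed-weight footprint in exchange for skipping the per-iteration widening of the weights.
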
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8) {
     TEST_REQUIRES_X86_AVX2;
     GemmMicrokernelTester()
diff --git a/test/qs8-gemm-minmax-fp32.yaml b/test/qs8-gemm-minmax-fp32.yaml
index 3972b36..f0ffdb6 100644
--- a/test/qs8-gemm-minmax-fp32.yaml
+++ b/test/qs8-gemm-minmax-fp32.yaml
@@ -303,6 +303,99 @@
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128
   init: xnn_init_qs8_conv_minmax_fp32_sse4_params
   k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2
+  init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2
+  init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2
+  init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2
+  init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2
+  init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2
+  init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2
+  init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3
+  init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3
+  init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3
+  init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop
+  init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+  k-block: 8
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2
   init: xnn_init_qs8_conv_minmax_fp32_avx2_params
   k-block: 8
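Each YAML entry above drives the test generator: `name` is the microkernel under test, `init` the matching parameter-initialization function, and `k-block` the kernel's K unrolling. The k loops in the generated tests follow directly from `k-block: 8` (correspondence inferred from the generated code earlier in this change):

    /* How k-block = 8 maps onto the generated k loops:
     *   k_eq_8:   k = 8                           exactly one K block
     *   k_lt_8:   for (k = 1; k < 8; k++)         partial-block remainder path
     *   k_gt_8:   for (k = 9; k < 16; k++)        one full block plus remainder
     *   k_div_8:  for (k = 16; k <= 80; k += 8)   several full blocks, no remainder
     */
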
diff --git a/test/qs8-gemm-minmax-gemmlowp.cc b/test/qs8-gemm-minmax-gemmlowp.cc
index a160ecf..5c62d4b 100644
--- a/test/qs8-gemm-minmax-gemmlowp.cc
+++ b/test/qs8-gemm-minmax-gemmlowp.cc
@@ -36503,4536 +36503,6 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_eq_8) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_lt_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_gt_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_div_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_gt_4) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_div_4) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, strided_cm) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_eq_8) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_lt_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_gt_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_div_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_gt_4) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_div_4) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, strided_cm) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_eq_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_lt_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_gt_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_div_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_gt_4) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_div_4) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, strided_cm) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_eq_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_lt_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_gt_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_div_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_gt_4) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_div_4) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, strided_cm) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_eq_8) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_lt_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_gt_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_div_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_gt_4) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_div_4) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, strided_cm) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_eq_8) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_lt_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_gt_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_div_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_gt_4) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_div_4) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, strided_cm) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_eq_8) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_lt_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_gt_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_div_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_gt_4) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_div_4) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_div_4_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, strided_cm_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, strided_cm) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_eq_8) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_lt_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_gt_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_div_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_gt_4) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_div_4) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_div_4_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, strided_cm_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, strided_cm) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_eq_8) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_lt_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_gt_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_div_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_div_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_gt_4) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_div_4) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_div_4_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, strided_cm_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, strided_cm) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_eq_8) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_lt_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_gt_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_div_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(4)
-        .nr(4)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_div_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_gt_4) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_div_4) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(4)
-          .nr(4)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_div_4_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, strided_cm_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(4)
-            .nr(4)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, strided_cm) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(4)
-      .nr(4)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X4C8__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
     GemmMicrokernelTester()
@@ -42401,6801 +37871,6 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_eq_8) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_lt_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_gt_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_div_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_gt_4) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_div_4) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, strided_cm) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
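Every deleted case drives GemmMicrokernelTester with the same vocabulary: `mr`/`nr`/`kr`/`sr` describe the microkernel tile, `m`/`n`/`k` the problem actually run, and `a_stride`, `cn_stride`, and `cm_stride` exercise non-contiguous A and C. As a rough mental model of what such a tester verifies, a scalar QS8 GEMM reference might look like the sketch below (assumptions: a dense K-major B and a simplified FP32-style requantize helper; both function names are hypothetical, and the real kernels consume packed weights).

#include <math.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical requantization helper for this sketch only. */
static int8_t requantize(int32_t acc, float scale,
                         int8_t zero_point, int8_t qmin, int8_t qmax)
{
  const float fmin = (float) (qmin - zero_point);
  const float fmax = (float) (qmax - zero_point);
  float x = (float) acc * scale;
  x = x < fmin ? fmin : (x > fmax ? fmax : x);
  return (int8_t) ((int32_t) lrintf(x) + zero_point);
}

/* Reference QS8 GEMM: C[i][j] = requantize(bias[j] + sum_p A[i][p] * B[p][j]).
 * a_stride and cm_stride are row strides in elements, matching the tester's
 * .a_stride() and .cm_stride() knobs by their usage in these cases. */
static void qs8_gemm_reference(
    size_t m, size_t n, size_t k,
    const int8_t* a, size_t a_stride,
    const int8_t* b, const int32_t* bias,
    int8_t* c, size_t cm_stride,
    float scale, int8_t zero_point, int8_t qmin, int8_t qmax)
{
  for (size_t i = 0; i < m; i++) {
    for (size_t j = 0; j < n; j++) {
      int32_t acc = bias[j];
      for (size_t p = 0; p < k; p++) {
        acc += (int32_t) a[i * a_stride + p] * (int32_t) b[p * n + j];
      }
      c[i * cm_stride + j] = requantize(acc, scale, zero_point, qmin, qmax);
    }
  }
}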
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_eq_8) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_lt_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_gt_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_div_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_gt_4) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_div_4) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, strided_cm) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_eq_8) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_lt_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_gt_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_div_8) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_gt_4) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_div_4) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, strided_cm) {
-    TEST_REQUIRES_X86_SSE2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
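The k sweeps in the case names bracket the c8 kernels' 8-element accumulation block: `k_eq_8` is exactly one block, `k_lt_8` (1..7) the remainder-only path, `k_gt_8` (9..15) one block plus remainder, and `k_div_8` (16..80 by 8) whole multiples. A trivial standalone loop reproducing those sweeps:

#include <stdio.h>

/* Enumerate the k values the generated c8 cases iterate over. */
int main(void) {
  for (size_t k = 1; k < 8; k++)       printf("k_lt_8:  k=%zu\n", k);  /* remainder only */
  for (size_t k = 9; k < 16; k++)      printf("k_gt_8:  k=%zu\n", k);  /* block + remainder */
  for (size_t k = 16; k <= 80; k += 8) printf("k_div_8: k=%zu\n", k);  /* whole blocks */
  return 0;
}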
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_eq_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_lt_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_gt_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_div_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_gt_4) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_div_4) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, strided_cm) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
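All of these cases set `extended_weights(true)`, selecting the `_xw_` kernels. By the usual reading of that knob, the packed weights are pre-widened from int8 to int16 so the SIMD inner loop can multiply-accumulate 16-bit lanes directly instead of sign-extending per iteration. A minimal sketch of that packing step (the function name is illustrative):

#include <stddef.h>
#include <stdint.h>

/* Illustrative packing step for extended-weights kernels: sign-extend each
 * int8 weight to int16 once, offline, so the GEMM inner loop avoids doing it. */
static void extend_weights_sketch(const int8_t* w8, int16_t* w16, size_t count)
{
  for (size_t i = 0; i < count; i++) {
    w16[i] = (int16_t) w8[i];
  }
}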
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_eq_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_lt_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_gt_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_div_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_gt_4) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_div_4) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, strided_cm) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_eq_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_lt_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_gt_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_div_8) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_gt_4) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_div_4) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSSE3;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, strided_cm) {
-    TEST_REQUIRES_X86_SSSE3;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_eq_8) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_lt_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_gt_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_div_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_gt_4) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_div_4) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, strided_cm) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_eq_8) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_lt_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_gt_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_div_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_gt_4) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_div_4) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, strided_cm) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_eq_8) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_lt_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_gt_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_div_8) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_gt_4) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_div_4) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, strided_cm_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, strided_cm) {
-    TEST_REQUIRES_X86_SSE41;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_eq_8) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_lt_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_gt_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_div_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_gt_4) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_div_4) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_div_4_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, strided_cm_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, strided_cm) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_eq_8) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_lt_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_gt_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_div_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_gt_4) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_div_4) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_div_4_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, strided_cm_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, strided_cm) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_eq_8) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_lt_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_gt_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_div_8) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_gt_4) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_div_4) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_div_4_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, strided_cm_subtile) {
-    TEST_REQUIRES_X86_AVX;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, strided_cm) {
-    TEST_REQUIRES_X86_AVX;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
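// Every deleted TEST block in this region follows one GemmMicrokernelTester
// pattern: .mr()/.nr()/.kr()/.sr() pin the microkernel's register tile (3x4
// with k unrolled by 8 here), .m()/.n()/.k() set the actual problem size, and
// the test name encodes which dimension the surrounding loops sweep. The
// standalone sketch below (illustration only, not XNNPACK code) reproduces
// the exact loop bounds of the 3x4c8 tests above, making the k_lt/k_gt/k_div
// and subtile taxonomy explicit.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t mr = 3;    // .mr(3): rows per output tile
  const uint32_t nr = 4;    // .nr(4): columns per output tile
  const size_t kblock = 8;  // .kr(8) * .sr(1): k unroll factor
  // k_lt_8, k_gt_8, k_div_8: below, just above, and multiples of the unroll.
  for (size_t k = 1; k < kblock; k++)
    std::printf("k_lt_8: m=%u n=%u k=%zu\n", mr, nr, k);
  for (size_t k = kblock + 1; k < 2 * kblock; k++)
    std::printf("k_gt_8: m=%u n=%u k=%zu\n", mr, nr, k);
  for (size_t k = 2 * kblock; k <= 10 * kblock; k += kblock)
    std::printf("k_div_8: m=%u n=%u k=%zu\n", mr, nr, k);
  // n_gt_4_subtile: n past one tile, k in steps of kblock + 1, every m <= mr;
  // the real tests run each such point once (.iterations(1)).
  for (uint32_t n = nr + 1; n < 2 * nr; n++)
    for (size_t k = 1; k <= 5 * kblock; k += kblock + 1)
      for (uint32_t m = 1; m <= mr; m++)
        std::printf("n_gt_4_subtile: m=%u n=%u k=%zu\n", m, n, k);
  return 0;
}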
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_eq_8) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_lt_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_gt_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_div_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_div_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_gt_4) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_div_4) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_div_4_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, strided_cm_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, strided_cm) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
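// The .extended_weights(true) setting shared by all of these xw tests asks
// the tester to pack weights in the extended ("xw") layout. For the x86 QS8
// kernels this presumably means each int8 weight is sign-extended to int16 at
// packing time so the inner loop can multiply without unpacking; the sketch
// below spells out that assumption and is not the actual XNNPACK packing code.
#include <cstdint>
#include <vector>

std::vector<int16_t> extend_weights(const std::vector<int8_t>& w) {
  // Implicit sign extension int8 -> int16: 2x the weight storage in exchange
  // for a cheaper inner loop.
  return std::vector<int16_t>(w.begin(), w.end());
}

int main() {
  return extend_weights({-128, -1, 0, 127}).size() == 4 ? 0 : 1;
}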
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_eq_8) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_lt_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_gt_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_div_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_div_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_gt_4) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_div_4) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_div_4_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, strided_cm_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, strided_cm) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
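// Each .Test(...) call above passes, after the microkernel itself, a conv
// params initializer, a requantization params initializer, and a scalar
// reference requantizer (xnn_qs8_requantize_gemmlowp) that the tester checks
// results against. For contrast, here is a standalone sketch of what an
// fp32-style reference requantizer computes; the rounding and clamping order
// is an assumption, and the real routines differ in detail.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int8_t requantize_fp32(int32_t acc, float scale, int8_t zero_point,
                       int8_t qmin, int8_t qmax) {
  // Scale the int32 accumulator in single precision, round to nearest,
  // re-center on the output zero point, then clamp to the quantized range.
  const long rounded = std::lrintf(static_cast<float>(acc) * scale);
  const long biased = rounded + zero_point;
  return static_cast<int8_t>(
      std::min<long>(std::max<long>(biased, qmin), qmax));
}

int main() {
  std::printf("%d\n", requantize_fp32(1234, 0.01f, -1, -128, 127));  // 11
  return 0;
}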
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_eq_8) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .cn_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 4; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(4)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 1; n <= 4; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_lt_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_gt_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_div_8) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(4)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(4)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_div_8_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_gt_4) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_gt_4_strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_gt_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_gt_4_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 5; n < 8; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_div_4) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(4)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_div_4_strided_cn) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(7)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_div_4_strided_a) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(4)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_div_4_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (uint32_t n = 8; n <= 12; n += 4) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, strided_cm_subtile) {
-    TEST_REQUIRES_X86_XOP;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 4; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(4)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(7)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, strided_cm) {
-    TEST_REQUIRES_X86_XOP;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(4)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(4)
-      .k(8)
-      .cm_stride(7)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8) {
     TEST_REQUIRES_X86_AVX2;
     GemmMicrokernelTester()
@@ -50564,1365 +39239,6 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, strided_cn) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_lt_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_gt_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_div_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_gt_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_gt_8_strided_cn) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_div_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_div_8_strided_cn) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, strided_cm_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, strided_cm) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
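Each case in the block above funnels into GemmMicrokernelTester, which packs the extended (16-bit) weights, runs the microkernel, recomputes the accumulators in int32, requantizes that reference result with the supplied reference function, and compares outputs. A rough sketch of the reference computation only, with hypothetical names; `requantize` here abstracts e.g. xnn_qs8_requantize_gemmlowp, which in the real API also takes requantization parameters:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-in for the tester's reference path, not XNNPACK code. */
static void reference_qs8_gemm(
    size_t m, size_t n, size_t k,
    const int8_t* a,                    /* m x k, row-major */
    const int8_t* b,                    /* n x k: one row per output channel */
    const int32_t* bias,                /* n entries */
    int8_t (*requantize)(int32_t acc),  /* reference requantization */
    int8_t* c) {                        /* m x n, row-major */
  for (size_t i = 0; i < m; i++) {
    for (size_t j = 0; j < n; j++) {
      int32_t acc = bias[j];
      for (size_t p = 0; p < k; p++) {
        /* Widen to int32 before multiplying, as the kernels do. */
        acc += (int32_t) a[i * k + p] * (int32_t) b[j * k + p];
      }
      c[i * n + j] = requantize(acc);
    }
  }
}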
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_eq_8) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, strided_cn) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_lt_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_gt_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_div_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_gt_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_gt_8_strided_cn) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_div_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_div_8_strided_cn) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, strided_cm_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, strided_cm) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_eq_8) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, strided_cn) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_eq_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_eq_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_eq_8_subtile_m) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_eq_8_subtile_n) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_lt_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_lt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_lt_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_gt_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_div_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .extended_weights(true)
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_gt_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_gt_8_strided_cn) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_gt_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_gt_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_div_8) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_div_8_strided_cn) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_div_8_strided_a) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .extended_weights(true)
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_div_8_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, strided_cm_subtile) {
-    TEST_REQUIRES_X86_AVX2;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .extended_weights(true)
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, strided_cm) {
-    TEST_REQUIRES_X86_AVX2;
-    GemmMicrokernelTester()
-      .extended_weights(true)
-      .mr(3)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__AVX512SKX, k_eq_8) {
     TEST_REQUIRES_X86_AVX512SKX;
     GemmMicrokernelTester()
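The tests kept above still exercise gemmlowp (fixed-point) requantization; the extended-weights kernels this change migrates switch to FP32 requantization instead. A minimal scalar sketch of that scheme, written by analogy with the xnn_qs8_requantize_fp32 reference function (ours, not the vectorized kernel):

#include <math.h>
#include <stdint.h>

/* Sketch of FP32 requantization: scale the int32 accumulator in float,
   clamp relative to the output zero point, round to nearest-even. */
static inline int8_t requantize_fp32(
    int32_t acc, float scale, int8_t zero_point, int8_t qmin, int8_t qmax) {
  const float acc_scaled = (float) acc * scale;
  /* Clamp in the float domain so the rounded result plus the zero point
     always lands in [qmin, qmax]. */
  const float acc_min = (float) ((int32_t) qmin - (int32_t) zero_point);
  const float acc_max = (float) ((int32_t) qmax - (int32_t) zero_point);
  const float acc_clamped = fminf(fmaxf(acc_scaled, acc_min), acc_max);
  /* lrintf rounds to nearest-even under the default rounding mode. */
  return (int8_t) ((int32_t) lrintf(acc_clamped) + (int32_t) zero_point);
}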
diff --git a/test/qs8-gemm-minmax-gemmlowp.yaml b/test/qs8-gemm-minmax-gemmlowp.yaml
index 248f61d..65a6dd7 100644
--- a/test/qs8-gemm-minmax-gemmlowp.yaml
+++ b/test/qs8-gemm-minmax-gemmlowp.yaml
@@ -243,36 +243,6 @@
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64
   init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64
   init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
   k-block: 8
@@ -282,51 +252,6 @@
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64
   init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
   k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop
-  init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
-  k-block: 8
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__avx2
   init: xnn_init_qs8_conv_minmax_gemmlowp_avx2_params
   k-block: 8
@@ -336,15 +261,6 @@
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__avx2
   init: xnn_init_qs8_conv_minmax_gemmlowp_avx2_params
   k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2
-  init: xnn_init_qs8_conv_minmax_gemmlowp_avx2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2
-  init: xnn_init_qs8_conv_minmax_gemmlowp_avx2_params
-  k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2
-  init: xnn_init_qs8_conv_minmax_gemmlowp_avx2_params
-  k-block: 8
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__avx512skx
   init: xnn_init_qs8_conv_minmax_gemmlowp_avx512_params
   k-block: 8
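With these gemmlowp extended-weights entries gone, the FP32 replacements would be registered in the corresponding fp32 test manifest. Assuming they follow the same naming convention as the renamed kernels and params initializers, a representative entry would look like:

- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2
  init: xnn_init_qs8_conv_minmax_fp32_avx2_params
  k-block: 8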