NEON implementations of QS8 2x8c16 GEMM and IGEMM assembly microkernels

PiperOrigin-RevId: 382565189
diff --git a/BUILD.bazel b/BUILD.bazel
index e3ce51a..86b3df3 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1489,69 +1489,69 @@
     "src/math/roundz-wasmsimd-cvt.c",
     "src/math/sigmoid-wasmsimd-rr2-lut64-p2-div.c",
     "src/math/sigmoid-wasmsimd-rr2-p5-div.c",
-    "src/qc8-dwconv/gen/up16x25-minmax-fp32-wasmsimd-mul16.c",
-    "src/qc8-dwconv/gen/up16x9-minmax-fp32-wasmsimd-mul16.c",
-    "src/qc8-dwconv/gen/up24x25-minmax-fp32-wasmsimd-mul16.c",
-    "src/qc8-dwconv/gen/up24x9-minmax-fp32-wasmsimd-mul16.c",
-    "src/qc8-dwconv/gen/up8x25-minmax-fp32-wasmsimd-mul16.c",
     "src/qc8-dwconv/gen/up8x9-minmax-fp32-wasmsimd-mul16.c",
-    "src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-ld128.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-wasmsimd-mul16.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-wasmsimd-mul16.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-wasmsimd-mul16.c",
+    "src/qc8-dwconv/gen/up24x9-minmax-fp32-wasmsimd-mul16.c",
+    "src/qc8-dwconv/gen/up24x25-minmax-fp32-wasmsimd-mul16.c",
     "src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-ld128.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-ld128.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-ld128.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-ld128.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-ld128.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qs8-dwconv/gen/up16x25-minmax-fp32-wasmsimd-mul16.c",
-    "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-wasmsimd-mul16.c",
-    "src/qs8-dwconv/gen/up16x9-minmax-fp32-wasmsimd-mul16.c",
-    "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-wasmsimd-mul16.c",
-    "src/qs8-dwconv/gen/up24x25-minmax-fp32-wasmsimd-mul16.c",
-    "src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-wasmsimd-mul16.c",
-    "src/qs8-dwconv/gen/up24x9-minmax-fp32-wasmsimd-mul16.c",
-    "src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-wasmsimd-mul16.c",
-    "src/qs8-dwconv/gen/up8x25-minmax-fp32-wasmsimd-mul16.c",
-    "src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-wasmsimd-mul16.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-wasmsimd-mul16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-wasmsimd-mul16.c",
+    "src/qs8-dwconv/gen/up8x25-minmax-fp32-wasmsimd-mul16.c",
+    "src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-wasmsimd-mul16.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-fp32-wasmsimd-mul16.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-wasmsimd-mul16.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-fp32-wasmsimd-mul16.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-wasmsimd-mul16.c",
+    "src/qs8-dwconv/gen/up24x9-minmax-fp32-wasmsimd-mul16.c",
+    "src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-wasmsimd-mul16.c",
+    "src/qs8-dwconv/gen/up24x25-minmax-fp32-wasmsimd-mul16.c",
+    "src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-wasmsimd-mul16.c",
     "src/qs8-gavgpool/gen/7p7x-minmax-wasmsimd-c8-acc2.c",
     "src/qs8-gavgpool/gen/7p7x-minmax-wasmsimd-c16-acc2.c",
     "src/qs8-gavgpool/gen/7p7x-minmax-wasmsimd-c24-acc2.c",
     "src/qs8-gavgpool/gen/7x-minmax-wasmsimd-c8-acc2.c",
     "src/qs8-gavgpool/gen/7x-minmax-wasmsimd-c16-acc2.c",
     "src/qs8-gavgpool/gen/7x-minmax-wasmsimd-c24-acc2.c",
-    "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
+    "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld64.c",
+    "src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
     "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-wasmsimd.c",
-    "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
+    "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld64.c",
+    "src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
     "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c",
-    "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
+    "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld64.c",
+    "src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
     "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-wasmsimd.c",
-    "src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
+    "src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld64.c",
-    "src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-ld128.c",
+    "src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
     "src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
+    "src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld64.c",
-    "src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-ld128.c",
+    "src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
     "src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-ld64.c",
-    "src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
+    "src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-ld128.c",
     "src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld64.c",
+    "src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c",
     "src/qs8-requantization/fp32-wasmsimd.c",
     "src/qs8-requantization/gemmlowp-wasmsimd.c",
     "src/qs8-vadd/gen/minmax-wasmsimd-x8.c",
@@ -4386,12 +4386,14 @@
     "src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S",
     "src/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S",
     "src/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S",
+    "src/qc8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S",
     "src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
     "src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
     "src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
     "src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S",
     "src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S",
     "src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
+    "src/qc8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S",
     "src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
     "src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
     "src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
@@ -4410,6 +4412,7 @@
     "src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S",
     "src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S",
     "src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S",
+    "src/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S",
     "src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S",
     "src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
     "src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
@@ -4431,6 +4434,7 @@
     "src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
     "src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S",
     "src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S",
+    "src/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S",
     "src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S",
     "src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
     "src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2a2a430..a862b30 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3572,12 +3572,14 @@
   src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S
   src/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S
   src/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S
+  src/qc8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
   src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
   src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
   src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
   src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S
   src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
   src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
+  src/qc8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
   src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
   src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
   src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
@@ -3596,6 +3598,7 @@
   src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
   src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
   src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
+  src/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
   src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
   src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
   src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
@@ -3617,6 +3620,7 @@
   src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
   src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
   src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+  src/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
   src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
   src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
   src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index de492eb..8135073 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -232,11 +232,6 @@
 
 ############################### AArch64 assembly ##############################
 # Cortex A53 micro-kernel
-tools/xngen src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=0 -o src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
-tools/xngen src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=1 -o src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
-tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=0 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
-tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=1 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
-
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
 
@@ -246,15 +241,6 @@
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -o src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -o src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
 
-# Cortex A75 micro-kernel
-tools/xngen src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=0 -o src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
-tools/xngen src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=1 -o src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
-tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=0 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
-tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=1 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
-
-tools/xngen src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in    -o src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
-tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in     -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
-
 ### C4 micro-kernels
 tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in       -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
 tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -277,6 +263,23 @@
 tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -o src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
 tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in      -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -o src/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
 
+### C8 / C16 micro-kernels
+tools/xngen src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+tools/xngen src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+tools/xngen src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in            -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+tools/xngen src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in            -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in            -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in            -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in                          -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
+
+tools/xngen src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in                         -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+
+tools/xngen src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in                         -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -o src/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
+
+tools/xngen src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in                         -D REQUANTIZATION=FP32 -D CHANNELWISE=1 -o src/qc8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
+
 ################################### x86 SSE ###################################
 ### C2 micro-kernels
 tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64     -o src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index ee46633..1b5a661 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -229,11 +229,6 @@
 
 ############################### AArch64 assembly ##############################
 # Cortex A53 micro-kernel
-tools/xngen src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=0 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
-tools/xngen src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=1 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
-tools/xngen src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=0 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
-tools/xngen src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=1 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
-
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
 
@@ -243,14 +238,6 @@
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -o src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -o src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
 
-# Cortex A75 micro-kernel
-tools/xngen src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=0 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
-tools/xngen src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=1 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
-tools/xngen src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=0 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
-tools/xngen src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=1 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
-
-tools/xngen src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in    -o src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
-
 ### C4 micro-kernels
 tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
 tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -264,6 +251,22 @@
 tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -o src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
 tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in      -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -o src/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
 
+### C8 / C16 micro-kernels
+tools/xngen src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+tools/xngen src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+tools/xngen src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+tools/xngen src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+tools/xngen src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal.S.in            -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+tools/xngen src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal.S.in            -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+tools/xngen src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S.in            -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+tools/xngen src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S.in            -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+tools/xngen src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in                         -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+
+tools/xngen src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in                         -D REQUANTIZATION=FP32     -D CHANNELWISE=0 -o src/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
+
+tools/xngen src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in                         -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -o src/qc8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
+
+
 ################################### x86 SSE ###################################
 ### C2 micro-kernels
 tools/xngen src/qs8-igemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D CHANNELWISE=0 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64  -o src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
diff --git a/src/qc8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S b/src/qc8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
new file mode 100644
index 0000000..205df60
--- /dev/null
+++ b/src/qc8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
@@ -0,0 +1,220 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x10
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3  v0
+# A1  x4  v1
+# B   x5  v4  v5  v6  v7
+# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+# unused  v8 v9
+
+BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+        # Clamp A and C pointers (row 1 aliases row 0 when mr < 2), spill d10-d15, round kc up to 16
+        CMP     x0, 2                   // if mr < 2
+        STP     d10, d11, [sp, -48]!    // reserve 48 bytes of stack; d10-d15 are callee-saved
+        ADD     x4, x3, x4              // a1 = a0 + a_stride
+        STP     d12, d13, [sp, 16]
+        ADD     x7, x6, x7              // c1 = c0 + cm_stride
+        STP     d14, d15, [sp, 32]
+        CSEL    x4, x3, x4, LO          //   a1 = a0
+        ADD     x2, x2, 15              // kc = (kc + 15) & ~15
+        CSEL    x7, x6, x7, LO          //   c1 = c0
+        BIC     x2, x2, 15
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators (one int32 per column, copied to both rows)
+        MOV     x0, x2                  // k = kc
+        LDP     s16, s18, [x5], 8       // bias for columns 0, 1 in lane 0
+        MOV     v17.16b, v16.16b        // row 1 starts from the same bias
+        MOV     v19.16b, v18.16b
+        LDP     s20, s22, [x5], 8       // bias for columns 2, 3
+        MOV     v21.16b, v20.16b
+        MOV     v23.16b, v22.16b
+        LDP     s24, s26, [x5], 8       // bias for columns 4, 5
+        MOV     v25.16b, v24.16b
+        MOV     v27.16b, v26.16b
+        LDP     s28, s30, [x5], 8       // bias for columns 6, 7
+        MOV     v29.16b, v28.16b
+        LDP     x10, x11, [sp, 48]       // cn_stride, params
+        MOV     v31.16b, v30.16b
+
+        # Main loop - 16 bytes of A per row against 8 columns x 16 bytes (128 bytes) of B
+        .p2align 3
+1:
+        LDR     q0, [x3], 16            // 16 bytes of row 0
+        LDP     q4, q5, [x5]            // B columns 0, 1
+        LDR     q1, [x4], 16            // 16 bytes of row 1
+        LDP     q6, q7, [x5, 32]        // B columns 2, 3
+        SMULL   v2.8h, v4.8b, v0.8b     // int16 products of the low 8 bytes
+        SMULL   v3.8h, v4.8b, v1.8b
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        SMLAL2  v2.8h, v4.16b, v0.16b   // add products of the high 8 bytes; NOTE(review): int16 sum wraps only if both pairs are -128 * -128 - presumably excluded by the caller, confirm
+        SMLAL2  v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v16.4s,  v2.8h          // pairwise widen and accumulate into int32 (row 0, column 0)
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v17.4s,  v3.8h          // row 1, column 0
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     q4, q5, [x5, 64]        // B columns 4, 5
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SMULL   v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL   v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     q6, q7, [x5, 96]        // B columns 6, 7
+
+        SMLAL2  v2.8h, v4.16b, v0.16b
+        SMLAL2  v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        ADD     x5, x5, 128             // w += 8 columns * 16 bytes
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HI below
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+        B.HI    1b                      // loop while k > 0
+
+        # Add columns: two rounds of pairwise adds reduce each accumulator's 4 lanes to one int32 per column
+        ADDP    v16.4s, v16.4s, v18.4s
+        ADDP    v20.4s, v20.4s, v22.4s
+        ADDP    v24.4s, v24.4s, v26.4s
+        ADDP    v28.4s, v28.4s, v30.4s
+        ADDP    v17.4s, v17.4s, v19.4s
+        ADDP    v21.4s, v21.4s, v23.4s
+        ADDP    v25.4s, v25.4s, v27.4s
+        ADDP    v29.4s, v29.4s, v31.4s
+        ADDP    v0.4s, v16.4s, v20.4s   // row 0, columns 0-3
+        ADDP    v1.4s, v24.4s, v28.4s   // row 0, columns 4-7
+        ADDP    v2.4s, v17.4s, v21.4s   // row 1, columns 0-3
+        ADDP    v3.4s, v25.4s, v29.4s   // row 1, columns 4-7
+
+        # Load per channel scale values from weights
+        SCVTF   v0.4s, v0.4s            // int32 -> float
+        LDR     q4, [x5], 16            // scales for columns 0-3
+        SCVTF   v1.4s, v1.4s
+        LDR     q5, [x5], 16            // scales for columns 4-7
+        SCVTF   v2.4s, v2.4s
+        SCVTF   v3.4s, v3.4s
+        FMUL    v0.4s, v0.4s, v4.4s
+        FMUL    v1.4s, v1.4s, v5.4s
+        FMUL    v2.4s, v2.4s, v4.4s
+        FMUL    v3.4s, v3.4s, v5.4s
+
+        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest with ties to even
+        FCVTNS  v1.4s, v1.4s
+        FCVTNS  v2.4s, v2.4s
+        FCVTNS  v3.4s, v3.4s
+
+        LD1R    {v5.8h}, [x11], 2       // 16-bit value added to every result - presumably the output zero point, confirm against params layout
+        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
+        SQXTN   v2.4h, v2.4s
+        SQXTN2  v0.8h, v1.4s            // v0 = row 0, columns 0-7
+        SQXTN2  v2.8h, v3.4s            // v2 = row 1, columns 0-7
+        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO and B.HI below
+        SQADD   v0.8h, v0.8h, v5.8h
+        SQADD   v1.8h, v2.8h, v5.8h
+        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8: row 0 in the low 8 bytes
+        SQXTN2  v0.16b, v1.8h           // row 1 in the high 8 bytes
+        LD1R    {v1.16b}, [x11], 1      // lower clamp bound
+        LD1R    {v2.16b}, [x11]         // upper clamp bound
+        SMAX    v0.16b, v0.16b, v1.16b
+        SMIN    v0.16b, v0.16b, v2.16b
+        B.LO    2f                      // fewer than 8 columns were left: partial store
+
+        # Store full 2 x 8
+        ST1     {v0.8b}, [x6], x10      // row 0; c0 += cn_stride
+        SUB     x3, x3, x2              // a0 -= kc
+        ST1     {v0.d}[1], [x7], x10    // row 1; c1 += cn_stride
+        SUB     x4, x4, x2              // a1 -= kc
+        B.HI    0b                      // next 8 columns while nc > 0
+
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+        # Store odd width: emit 4, 2 and 1 columns according to the low 3 bits of nc
+        .p2align 3
+2:
+        TBZ     x1, 2, 3f               // skip unless bit 2 is set (4 columns)
+        STR     s0, [x6], 4
+        ST1     {v0.s}[2], [x7], 4
+        EXT     v0.16b, v0.16b, v0.16b, 4       // rotate the stored bytes out of both rows
+
+3:
+        TBZ     x1, 1, 4f               // skip unless bit 1 is set (2 columns)
+        ST1     {v0.h}[0], [x6], 2
+        ST1     {v0.h}[4], [x7], 2
+        EXT     v0.16b, v0.16b, v0.16b, 2
+4:
+        TBZ     x1, 0, 5f               // skip unless bit 0 is set (1 column)
+        ST1     {v0.b}[0], [x6]
+        ST1     {v0.b}[8], [x7]
+5:
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
+
diff --git a/src/qc8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S b/src/qc8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
new file mode 100644
index 0000000..c14a4c6
--- /dev/null
+++ b/src/qc8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
@@ -0,0 +1,241 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t**restrict a,  x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const union xnn_qs8_minmax_params params) [sp + 24] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13  v0
+# A1 x15  v1
+# B   x5  v4  v5  v6  v7
+# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+# unused  v8 v9
+
+BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+        # Clamp C pointers
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+        ADD     x7, x6, x7              // c1 = c0 + cm_stride
+        STP     d10, d11, [sp, -48]!    // Save d10-d15: v10-v15 are used as temporaries
+        ADD     x2, x2, 15              // kc = (kc + 15) & ~15
+        STP     d12, d13, [sp, 16]
+        CSEL    x7, x6, x7, LO          //   c1 = c0
+        STP     d14, d15, [sp, 32]
+        BIC     x2, x2, 15              // kc is now rounded up to a multiple of 16
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     s16, s18, [x5], 8       // Scalar loads: bias in lane 0, upper lanes zeroed
+        MOV     v17.16b, v16.16b        // Row 1 accumulators start from the same bias as row 0
+        MOV     v19.16b, v18.16b
+        LDP     s20, s22, [x5], 8
+        MOV     v21.16b, v20.16b
+        MOV     v23.16b, v22.16b
+        LDP     s24, s26, [x5], 8
+        MOV     v25.16b, v24.16b
+        MOV     v27.16b, v26.16b
+        LDP     s28, s30, [x5], 8
+        MOV     v29.16b, v28.16b
+        MOV     v31.16b, v30.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 2 A pointers
+        LDP     x13, x15, [x4], 16      // a0, a1 from the indirection buffer; a += 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
+        CMP     x15, x12                // if a1 == zero
+        ADD     x15, x15, x8            // a1 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
+
+        MOV     x0, x2                  // k = kc
+
+        # Main loop - 16 bytes of A
+        .p2align 3
+2:
+        LDR     q0, [x13], 16           // v0 = 16 bytes of A0
+        LDP     q4, q5, [x5]            // v4, v5 = B vectors feeding v16/v17 and v18/v19
+        LDR     q1, [x15], 16           // v1 = 16 bytes of A1
+        LDP     q6, q7, [x5, 32]        // v6, v7 = B vectors feeding v20/v21 and v22/v23
+        SMULL   v2.8h, v4.8b, v0.8b     // 16-bit products of the low 8 bytes
+        SMULL   v3.8h, v4.8b, v1.8b
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        SMLAL2  v2.8h, v4.16b, v0.16b   // plus the products of the high 8 bytes
+        SMLAL2  v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v16.4s,  v2.8h          // pairwise add the 16-bit sums into 32-bit accumulators
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     q4, q5, [x5, 64]        // v4, v5 = B vectors feeding v24/v25 and v26/v27
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SMULL   v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL   v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     q6, q7, [x5, 96]        // v6, v7 = B vectors feeding v28/v29 and v30/v31
+
+        SMLAL2  v2.8h, v4.16b, v0.16b
+        SMLAL2  v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        ADD     x5, x5, 128             // w += 8 vectors * 16 bytes
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        SUBS    x0, x0, 16              // k -= 16; NEON ops below leave the flags intact
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+        B.HI    2b                      // loop while k > 0
+
+        # ks loop
+        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b                      // more A pointer pairs to process
+
+        # Add columns
+        ADDP    v16.4s, v16.4s, v18.4s  // First two levels: pairwise sums of adjacent accumulators
+        ADDP    v20.4s, v20.4s, v22.4s
+        ADDP    v24.4s, v24.4s, v26.4s
+        ADDP    v28.4s, v28.4s, v30.4s
+        ADDP    v17.4s, v17.4s, v19.4s
+        ADDP    v21.4s, v21.4s, v23.4s
+        ADDP    v25.4s, v25.4s, v27.4s
+        ADDP    v29.4s, v29.4s, v31.4s
+        ADDP    v0.4s, v16.4s, v20.4s   // v0 = row 0, columns 0-3 (full sums)
+        ADDP    v1.4s, v24.4s, v28.4s   // v1 = row 0, columns 4-7
+        ADDP    v2.4s, v17.4s, v21.4s   // v2 = row 1, columns 0-3
+        ADDP    v3.4s, v25.4s, v29.4s   // v3 = row 1, columns 4-7
+
+        # Load per channel scale values from weights
+        SCVTF   v0.4s, v0.4s            // int32 sums -> float
+        LDR     q4, [x5], 16            // v4 = scales for columns 0-3
+        SCVTF   v1.4s, v1.4s
+        LDR     q5, [x5], 16            // v5 = scales for columns 4-7
+        SCVTF   v2.4s, v2.4s
+        SCVTF   v3.4s, v3.4s
+        FMUL    v0.4s, v0.4s, v4.4s
+        FMUL    v1.4s, v1.4s, v5.4s
+        FMUL    v2.4s, v2.4s, v4.4s
+        FMUL    v3.4s, v3.4s, v5.4s
+
+        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest (ties to even)
+        FCVTNS  v1.4s, v1.4s
+        FCVTNS  v2.4s, v2.4s
+        FCVTNS  v3.4s, v3.4s
+
+        LD1R    {v5.8h}, [x11], 2       // v5 = 16-bit param added below (presumably output zero point)
+        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
+        SQXTN   v2.4h, v2.4s
+        SQXTN2  v0.8h, v1.4s            // v0 = row 0 (8 columns)
+        SQXTN2  v2.8h, v3.4s            // v2 = row 1 (8 columns)
+        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO / B.HI below
+        SQADD   v0.8h, v0.8h, v5.8h
+        SQADD   v1.8h, v2.8h, v5.8h
+        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8: row 0 in the low half
+        SQXTN2  v0.16b, v1.8h           //   row 1 in the high half
+        LD1R    {v1.16b}, [x11], 1      // v1 = lower clamp bound (used by SMAX)
+        LD1R    {v2.16b}, [x11]         // v2 = upper clamp bound (used by SMIN)
+        SMAX    v0.16b, v0.16b, v1.16b
+        SUB     x11, x11, 3               // rewind params pointer
+        SMIN    v0.16b, v0.16b, v2.16b
+        B.LO    3f                      // fewer than 8 columns remained
+
+        # Store full 2 x 8
+        ST1     {v0.d}[1], [x7], x10    // row 1; c1 += cn_stride
+        SUB     x4, x4, x3              // a -= ks
+        ST1     {v0.8b}, [x6], x10      // row 0; c0 += cn_stride
+
+        # nc loop
+        B.HI    0b                      // loop while nc > 0
+
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+        # Store odd width
+        .p2align 3
+3:
+        TBZ     x1, 2, 4f               // x1 = nc - 8 keeps the low 3 bits of nc; bit 2 -> 4 columns
+        ST1     {v0.s}[2], [x7], 4
+        STR     s0, [x6], 4
+        EXT     v0.16b, v0.16b, v0.16b, 4       // rotate by 4 bytes: next columns move to bytes 0 and 8
+
+4:
+        TBZ     x1, 1, 5f               // bit 1 -> 2 columns
+        ST1     {v0.h}[4], [x7], 2
+        ST1     {v0.h}[0], [x6], 2
+        EXT     v0.16b, v0.16b, v0.16b, 2       // rotate by 2 bytes
+5:
+        TBZ     x1, 0, 6f               // bit 0 -> 1 column
+        ST1     {v0.b}[8], [x7]
+        ST1     {v0.b}[0], [x6]
+6:
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
+
diff --git a/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in b/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
index d021f8b..7f3cf72 100644
--- a/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
@@ -3,9 +3,14 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
+
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal(
+$DATATYPE = "qc8" if CHANNELWISE else "qs8"
+$CONV_PARAMS = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+# void xnn_${DATATYPE}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_2x8c16__aarch64_neon_mlal_padal(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -15,7 +20,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union ${CONV_PARAMS} params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -29,7 +34,7 @@
 # temp1   v3 v11 v13 v15
 # unused  v8 v9
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
+BEGIN_FUNCTION xnn_${DATATYPE}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_2x8c16__aarch64_neon_mlal_padal
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -126,10 +131,12 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x11], 4
+        $if REQUANTIZATION == "GEMMLOWP":
+          LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x11], 4
+        $if REQUANTIZATION == "GEMMLOWP":
+          LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -139,25 +146,56 @@
         ADDP    v2.4s, v17.4s, v21.4s
         ADDP    v3.4s, v25.4s, v29.4s
 
-        # Apply params - scale, shift, bias and clamp
-        SQRDMULH v0.4s, v0.4s, v4.4s
-        SQRDMULH v1.4s, v1.4s, v4.4s
-        SQRDMULH v2.4s, v2.4s, v4.4s
-        SQRDMULH v3.4s, v3.4s, v4.4s
-        CMEQ    v4.4s, v7.4s, 0
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          SQRDMULH v0.4s, v0.4s, v4.4s
+          SQRDMULH v1.4s, v1.4s, v4.4s
+          SQRDMULH v2.4s, v2.4s, v4.4s
+          SQRDMULH v3.4s, v3.4s, v4.4s
+          CMEQ    v4.4s, v7.4s, 0
+          BIC     v6.16b, v0.16b, v4.16b
+          BIC     v16.16b, v1.16b, v4.16b
+          BIC     v17.16b, v2.16b, v4.16b
+          BIC     v4.16b,  v3.16b, v4.16b
+          SSRA    v0.4s,  v6.4s, 31
+          SSRA    v1.4s, v16.4s, 31
+          SSRA    v2.4s, v17.4s, 31
+          SSRA    v3.4s,  v4.4s, 31
+          SRSHL   v0.4s, v0.4s, v7.4s
+          SRSHL   v1.4s, v1.4s, v7.4s
+          SRSHL   v2.4s, v2.4s, v7.4s
+          SRSHL   v3.4s, v3.4s, v7.4s
+        $elif REQUANTIZATION == "FP32":
+          $if not CHANNELWISE:
+            # Apply params - scale, bias and clamp
+            SCVTF   v0.4s, v0.4s
+            LD1R    {v4.4s}, [x11], 4
+            SCVTF   v1.4s, v1.4s
+            SCVTF   v2.4s, v2.4s
+            SCVTF   v3.4s, v3.4s
+            FMUL    v0.4s, v0.4s, v4.4s
+            FMUL    v1.4s, v1.4s, v4.4s
+            FMUL    v2.4s, v2.4s, v4.4s
+            FMUL    v3.4s, v3.4s, v4.4s
+          $else:
+            # Load per channel scale values from weights
+            SCVTF   v0.4s, v0.4s
+            LDR     q4, [x5], 16
+            SCVTF   v1.4s, v1.4s
+            LDR     q5, [x5], 16
+            SCVTF   v2.4s, v2.4s
+            SCVTF   v3.4s, v3.4s
+            FMUL    v0.4s, v0.4s, v4.4s
+            FMUL    v1.4s, v1.4s, v5.4s
+            FMUL    v2.4s, v2.4s, v4.4s
+            FMUL    v3.4s, v3.4s, v5.4s
+
+          FCVTNS  v0.4s, v0.4s
+          FCVTNS  v1.4s, v1.4s
+          FCVTNS  v2.4s, v2.4s
+          FCVTNS  v3.4s, v3.4s
+
         LD1R    {v5.8h}, [x11], 2
-        BIC     v6.16b, v0.16b, v4.16b
-        BIC     v16.16b, v1.16b, v4.16b
-        BIC     v17.16b, v2.16b, v4.16b
-        BIC     v4.16b,  v3.16b, v4.16b
-        SSRA    v0.4s,  v6.4s, 31
-        SSRA    v1.4s, v16.4s, 31
-        SSRA    v2.4s, v17.4s, 31
-        SSRA    v3.4s,  v4.4s, 31
-        SRSHL   v0.4s, v0.4s, v7.4s
-        SRSHL   v1.4s, v1.4s, v7.4s
-        SRSHL   v2.4s, v2.4s, v7.4s
-        SRSHL   v3.4s, v3.4s, v7.4s
         SQXTN   v0.4h, v0.4s
         SQXTN   v2.4h, v2.4s
         SQXTN2  v0.8h, v1.4s
@@ -210,7 +248,7 @@
         LDP     d10, d11, [sp], 48
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
+END_FUNCTION xnn_${DATATYPE}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_2x8c16__aarch64_neon_mlal_padal
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S b/src/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
new file mode 100644
index 0000000..d0eb12f
--- /dev/null
+++ b/src/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
@@ -0,0 +1,219 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x10
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3  v0
+# A1  x4  v1
+# B   x5  v4  v5  v6  v7
+# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+# unused  v8 v9
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        STP     d10, d11, [sp, -48]!    // Save d10-d15: v10-v15 are used as temporaries
+        ADD     x4, x3, x4              // a1 = a0 + a_stride
+        STP     d12, d13, [sp, 16]
+        ADD     x7, x6, x7              // c1 = c0 + cm_stride
+        STP     d14, d15, [sp, 32]
+        CSEL    x4, x3, x4, LO          //   a1 = a0
+        ADD     x2, x2, 15              // kc = (kc + 15) & ~15
+        CSEL    x7, x6, x7, LO          //   c1 = c0
+        BIC     x2, x2, 15              // kc is now rounded up to a multiple of 16
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        MOV     x0, x2                  // k = kc
+        LDP     s16, s18, [x5], 8       // Scalar loads: bias in lane 0, upper lanes zeroed
+        MOV     v17.16b, v16.16b        // Row 1 accumulators start from the same bias as row 0
+        MOV     v19.16b, v18.16b
+        LDP     s20, s22, [x5], 8
+        MOV     v21.16b, v20.16b
+        MOV     v23.16b, v22.16b
+        LDP     s24, s26, [x5], 8
+        MOV     v25.16b, v24.16b
+        MOV     v27.16b, v26.16b
+        LDP     s28, s30, [x5], 8
+        MOV     v29.16b, v28.16b
+        LDP     x10, x11, [sp, 48]       // cn_stride, params (+48 skips saved d10-d15); reloaded each pass as LD1R advances x11
+        MOV     v31.16b, v30.16b
+
+        # Main loop - 16 bytes of A
+        .p2align 3
+1:
+        LDR     q0, [x3], 16            // v0 = 16 bytes of A0
+        LDP     q4, q5, [x5]            // v4, v5 = B vectors feeding v16/v17 and v18/v19
+        LDR     q1, [x4], 16            // v1 = 16 bytes of A1
+        LDP     q6, q7, [x5, 32]        // v6, v7 = B vectors feeding v20/v21 and v22/v23
+        SMULL   v2.8h, v4.8b, v0.8b     // 16-bit products of the low 8 bytes
+        SMULL   v3.8h, v4.8b, v1.8b
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        SMLAL2  v2.8h, v4.16b, v0.16b   // plus the products of the high 8 bytes
+        SMLAL2  v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v16.4s,  v2.8h          // pairwise add the 16-bit sums into 32-bit accumulators
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     q4, q5, [x5, 64]        // v4, v5 = B vectors feeding v24/v25 and v26/v27
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SMULL   v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL   v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     q6, q7, [x5, 96]        // v6, v7 = B vectors feeding v28/v29 and v30/v31
+
+        SMLAL2  v2.8h, v4.16b, v0.16b
+        SMLAL2  v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        ADD     x5, x5, 128             // w += 8 vectors * 16 bytes
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        SUBS    x0, x0, 16              // k -= 16; NEON ops below leave the flags intact
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+        B.HI    1b                      // loop while k > 0
+
+        # Add columns
+        ADDP    v16.4s, v16.4s, v18.4s  // First two levels: pairwise sums of adjacent accumulators
+        ADDP    v20.4s, v20.4s, v22.4s
+        ADDP    v24.4s, v24.4s, v26.4s
+        ADDP    v28.4s, v28.4s, v30.4s
+        ADDP    v17.4s, v17.4s, v19.4s
+        ADDP    v21.4s, v21.4s, v23.4s
+        ADDP    v25.4s, v25.4s, v27.4s
+        ADDP    v29.4s, v29.4s, v31.4s
+        ADDP    v0.4s, v16.4s, v20.4s   // v0 = row 0, columns 0-3 (full sums)
+        ADDP    v1.4s, v24.4s, v28.4s   // v1 = row 0, columns 4-7
+        ADDP    v2.4s, v17.4s, v21.4s   // v2 = row 1, columns 0-3
+        ADDP    v3.4s, v25.4s, v29.4s   // v3 = row 1, columns 4-7
+
+        # Apply params - scale, bias and clamp
+        SCVTF   v0.4s, v0.4s            // int32 sums -> float
+        LD1R    {v4.4s}, [x11], 4       // v4 = 32-bit scale from params, broadcast to all lanes
+        SCVTF   v1.4s, v1.4s
+        SCVTF   v2.4s, v2.4s
+        SCVTF   v3.4s, v3.4s
+        FMUL    v0.4s, v0.4s, v4.4s
+        FMUL    v1.4s, v1.4s, v4.4s
+        FMUL    v2.4s, v2.4s, v4.4s
+        FMUL    v3.4s, v3.4s, v4.4s
+
+        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest (ties to even)
+        FCVTNS  v1.4s, v1.4s
+        FCVTNS  v2.4s, v2.4s
+        FCVTNS  v3.4s, v3.4s
+
+        LD1R    {v5.8h}, [x11], 2       // v5 = 16-bit param added below (presumably output zero point)
+        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
+        SQXTN   v2.4h, v2.4s
+        SQXTN2  v0.8h, v1.4s            // v0 = row 0 (8 columns)
+        SQXTN2  v2.8h, v3.4s            // v2 = row 1 (8 columns)
+        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO / B.HI below
+        SQADD   v0.8h, v0.8h, v5.8h
+        SQADD   v1.8h, v2.8h, v5.8h
+        SQXTN   v0.8b, v0.8h            // saturating narrow int16 -> int8: row 0 in the low half
+        SQXTN2  v0.16b, v1.8h           //   row 1 in the high half
+        LD1R    {v1.16b}, [x11], 1      // v1 = lower clamp bound (used by SMAX)
+        LD1R    {v2.16b}, [x11]         // v2 = upper clamp bound (used by SMIN)
+        SMAX    v0.16b, v0.16b, v1.16b
+        SMIN    v0.16b, v0.16b, v2.16b
+        B.LO    2f                      // fewer than 8 columns remained
+
+        # Store full 2 x 8
+        ST1     {v0.8b}, [x6], x10      // row 0; c0 += cn_stride
+        SUB     x3, x3, x2              // a0 -= kc
+        ST1     {v0.d}[1], [x7], x10    // row 1; c1 += cn_stride
+        SUB     x4, x4, x2              // a1 -= kc
+        B.HI    0b                      // loop while nc > 0
+
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+        # Store odd width
+        .p2align 3
+2:
+        TBZ     x1, 2, 3f               // x1 = nc - 8 keeps the low 3 bits of nc; bit 2 -> 4 columns
+        STR     s0, [x6], 4
+        ST1     {v0.s}[2], [x7], 4
+        EXT     v0.16b, v0.16b, v0.16b, 4       // rotate by 4 bytes: next columns move to bytes 0 and 8
+
+3:
+        TBZ     x1, 1, 4f               // bit 1 -> 2 columns
+        ST1     {v0.h}[0], [x6], 2
+        ST1     {v0.h}[4], [x7], 2
+        EXT     v0.16b, v0.16b, v0.16b, 2       // rotate by 2 bytes
+4:
+        TBZ     x1, 0, 5f               // bit 0 -> 1 column
+        ST1     {v0.b}[0], [x6]
+        ST1     {v0.b}[8], [x7]
+5:
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
+
diff --git a/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index 670bc2f..b77500e 100644
--- a/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <xnnpack/assembly.h>
 
 # void xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal(
@@ -149,7 +150,6 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -162,6 +162,8 @@
         SRSHL   v1.4s, v1.4s, v7.4s
         SRSHL   v2.4s, v2.4s, v7.4s
         SRSHL   v3.4s, v3.4s, v7.4s
+
+        LD1R    {v5.8h}, [x11], 2
         SQXTN   v0.4h, v0.4s
         SQXTN   v2.4h, v2.4s
         SQXTN2  v0.8h, v1.4s
diff --git a/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in b/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in
index ad9ea6f..bb30160 100644
--- a/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in
@@ -3,9 +3,14 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
+
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal(
+$DATATYPE = "qc8" if CHANNELWISE else "qs8"
+$CONV_PARAMS = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+# void xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_2x8c16__aarch64_neon_mlal_padal(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -17,7 +22,7 @@
 #     size_t cn_stride,                  [sp] -> x10
 #     size_t a_offset,                   [sp + 8] -> x8
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_qs8_conv_minmax_params params [sp + 24] -> x11
+#     const union ${CONV_PARAMS} params [sp + 24] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -31,7 +36,7 @@
 # temp1   v3 v11 v13 v15
 # unused  v8 v9
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
+BEGIN_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_2x8c16__aarch64_neon_mlal_padal
 
         # Clamp C pointers
         LDP     x10, x8, [sp]           // Load cn_stride, a_offset
@@ -145,10 +150,12 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x11], 4
+        $if REQUANTIZATION == "GEMMLOWP":
+          LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x11], 4
+        $if REQUANTIZATION == "GEMMLOWP":
+          LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -158,25 +165,56 @@
         ADDP    v2.4s, v17.4s, v21.4s
         ADDP    v3.4s, v25.4s, v29.4s
 
-        # Apply params - scale, shift, bias and clamp
-        SQRDMULH v0.4s, v0.4s, v4.4s
-        SQRDMULH v1.4s, v1.4s, v4.4s
-        SQRDMULH v2.4s, v2.4s, v4.4s
-        SQRDMULH v3.4s, v3.4s, v4.4s
-        CMEQ    v4.4s, v7.4s, 0
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          SQRDMULH v0.4s, v0.4s, v4.4s
+          SQRDMULH v1.4s, v1.4s, v4.4s
+          SQRDMULH v2.4s, v2.4s, v4.4s
+          SQRDMULH v3.4s, v3.4s, v4.4s
+          CMEQ    v4.4s, v7.4s, 0
+          BIC     v6.16b, v0.16b, v4.16b
+          BIC     v16.16b, v1.16b, v4.16b
+          BIC     v17.16b, v2.16b, v4.16b
+          BIC     v4.16b,  v3.16b, v4.16b
+          SSRA    v0.4s,  v6.4s, 31
+          SSRA    v1.4s, v16.4s, 31
+          SSRA    v2.4s, v17.4s, 31
+          SSRA    v3.4s,  v4.4s, 31
+          SRSHL   v0.4s, v0.4s, v7.4s
+          SRSHL   v1.4s, v1.4s, v7.4s
+          SRSHL   v2.4s, v2.4s, v7.4s
+          SRSHL   v3.4s, v3.4s, v7.4s
+        $elif REQUANTIZATION == "FP32":
+          $if not CHANNELWISE:
+            # Apply params - scale, bias and clamp
+            SCVTF   v0.4s, v0.4s
+            LD1R    {v4.4s}, [x11], 4
+            SCVTF   v1.4s, v1.4s
+            SCVTF   v2.4s, v2.4s
+            SCVTF   v3.4s, v3.4s
+            FMUL    v0.4s, v0.4s, v4.4s
+            FMUL    v1.4s, v1.4s, v4.4s
+            FMUL    v2.4s, v2.4s, v4.4s
+            FMUL    v3.4s, v3.4s, v4.4s
+          $else:
+            # Load per channel scale values from weights
+            SCVTF   v0.4s, v0.4s
+            LDR     q4, [x5], 16
+            SCVTF   v1.4s, v1.4s
+            LDR     q5, [x5], 16
+            SCVTF   v2.4s, v2.4s
+            SCVTF   v3.4s, v3.4s
+            FMUL    v0.4s, v0.4s, v4.4s
+            FMUL    v1.4s, v1.4s, v5.4s
+            FMUL    v2.4s, v2.4s, v4.4s
+            FMUL    v3.4s, v3.4s, v5.4s
+
+          FCVTNS  v0.4s, v0.4s
+          FCVTNS  v1.4s, v1.4s
+          FCVTNS  v2.4s, v2.4s
+          FCVTNS  v3.4s, v3.4s
+
         LD1R    {v5.8h}, [x11], 2
-        BIC     v6.16b, v0.16b, v4.16b
-        BIC     v16.16b, v1.16b, v4.16b
-        BIC     v17.16b, v2.16b, v4.16b
-        BIC     v4.16b,  v3.16b, v4.16b
-        SSRA    v0.4s,  v6.4s, 31
-        SSRA    v1.4s, v16.4s, 31
-        SSRA    v2.4s, v17.4s, 31
-        SSRA    v3.4s,  v4.4s, 31
-        SRSHL   v0.4s, v0.4s, v7.4s
-        SRSHL   v1.4s, v1.4s, v7.4s
-        SRSHL   v2.4s, v2.4s, v7.4s
-        SRSHL   v3.4s, v3.4s, v7.4s
         SQXTN   v0.4h, v0.4s
         SQXTN   v2.4h, v2.4s
         SQXTN2  v0.8h, v1.4s
@@ -189,7 +227,13 @@
         LD1R    {v1.16b}, [x11], 1
         LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
-        SUB     x11, x11, 11              // rewind params pointer
+        $if REQUANTIZATION == "GEMMLOWP":
+          SUB     x11, x11, 11              // rewind params pointer
+        $elif REQUANTIZATION == "FP32":
+          $if CHANNELWISE:
+            SUB     x11, x11, 3               // rewind params pointer
+          $else:
+            SUB     x11, x11, 7               // rewind params pointer
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    3f
 
@@ -231,7 +275,7 @@
         LDP     d10, d11, [sp], 48
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
+END_FUNCTION xnn_${DATATYPE}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_2x8c16__aarch64_neon_mlal_padal
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S b/src/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
new file mode 100644
index 0000000..33057f9
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal-padal.S
@@ -0,0 +1,240 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t**restrict a,  x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13  v0
+# A1 x15  v1
+# B   x5  v4  v5  v6  v7
+# C0  x6 v16 v18 v20 v22 v24 v26 v28 v30
+# C1  x7 v17 v19 v21 v23 v25 v27 v29 v31
+# temp0   v2 v10 v12 v14
+# temp1   v3 v11 v13 v15
+# unused  v8 v9
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+        # Clamp C pointers, load stack arguments, save d10-d15 and round kc up to 16
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+        ADD     x7, x6, x7              // c1 = c0 + cm_stride
+        STP     d10, d11, [sp, -48]!    // reserve 48 bytes of stack, save d10-d11
+        ADD     x2, x2, 15              // kc = (kc + 15) & ~15
+        STP     d12, d13, [sp, 16]      // save d12-d13
+        CSEL    x7, x6, x7, LO          //   c1 = c0
+        STP     d14, d15, [sp, 32]      // save d14-d15
+        BIC     x2, x2, 15              // clear low 4 bits: completes the round-up of kc
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators (row 1 starts as a copy of row 0)
+        LDP     s16, s18, [x5], 8       // 32-bit bias for columns 0, 1 into lane 0
+        MOV     v17.16b, v16.16b
+        MOV     v19.16b, v18.16b
+        LDP     s20, s22, [x5], 8       // bias for columns 2, 3
+        MOV     v21.16b, v20.16b
+        MOV     v23.16b, v22.16b
+        LDP     s24, s26, [x5], 8       // bias for columns 4, 5
+        MOV     v25.16b, v24.16b
+        MOV     v27.16b, v26.16b
+        LDP     s28, s30, [x5], 8       // bias for columns 6, 7
+        MOV     v29.16b, v28.16b
+        MOV     v31.16b, v30.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 2 A pointers
+        LDP     x13, x15, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
+        CMP     x15, x12                // if a1 == zero
+        ADD     x15, x15, x8            // a1 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
+
+        MOV     x0, x2                  // k = kc
+
+        # Main loop - 16 bytes of A
+        .p2align 3
+2:
+        LDR     q0, [x13], 16           // 16 bytes of row 0, a0 += 16
+        LDP     q4, q5, [x5]            // B vectors for columns 0, 1
+        LDR     q1, [x15], 16           // 16 bytes of row 1, a1 += 16
+        LDP     q6, q7, [x5, 32]        // B vectors for columns 2, 3
+        SMULL   v2.8h, v4.8b, v0.8b     // low 8 bytes: int8 * int8 -> int16 products
+        SMULL   v3.8h, v4.8b, v1.8b
+        SMULL   v10.8h, v5.8b, v0.8b
+        SMULL   v11.8h, v5.8b, v1.8b
+        SMLAL2  v2.8h, v4.16b, v0.16b   // high 8 bytes: accumulate into the same int16 lanes
+        SMLAL2  v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v16.4s,  v2.8h          // pairwise-add int16 lanes into the int32 accumulator
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v17.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v18.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v19.4s, v11.8h
+        LDP     q4, q5, [x5, 64]        // B vectors for columns 4, 5
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SMULL   v2.8h, v4.8b, v0.8b
+        SADALP  v20.4s, v12.8h
+        SMULL   v3.8h, v4.8b, v1.8b
+        SADALP  v21.4s, v13.8h
+        SMULL   v10.8h, v5.8b, v0.8b
+        SADALP  v22.4s, v14.8h
+        SMULL   v11.8h, v5.8b, v1.8b
+        SADALP  v23.4s, v15.8h
+        LDP     q6, q7, [x5, 96]        // B vectors for columns 6, 7
+
+        SMLAL2  v2.8h, v4.16b, v0.16b
+        SMLAL2  v3.8h, v4.16b, v1.16b
+        SMLAL2  v10.8h, v5.16b, v0.16b
+        SMLAL2  v11.8h, v5.16b, v1.16b
+        ADD     x5, x5, 128             // w += 8 vectors * 16 bytes
+        SMULL   v12.8h, v6.8b, v0.8b
+        SADALP  v24.4s,  v2.8h
+        SMULL   v13.8h, v6.8b, v1.8b
+        SADALP  v25.4s,  v3.8h
+        SMULL   v14.8h, v7.8b, v0.8b
+        SADALP  v26.4s, v10.8h
+        SMULL   v15.8h, v7.8b, v1.8b
+        SADALP  v27.4s, v11.8h
+        SUBS    x0, x0, 16              // k -= 16, sets flags for B.HI below
+        SMLAL2  v12.8h, v6.16b, v0.16b
+        SMLAL2  v13.8h, v6.16b, v1.16b
+        SMLAL2  v14.8h, v7.16b, v0.16b
+        SMLAL2  v15.8h, v7.16b, v1.16b
+        SADALP  v28.4s, v12.8h
+        SADALP  v29.4s, v13.8h
+        SADALP  v30.4s, v14.8h
+        SADALP  v31.4s, v15.8h
+        B.HI    2b                      // loop while k > 0
+
+        # ks loop
+        SUBS    x9, x9, 16              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        # Add columns: reduce each 4-lane accumulator to a single int32 per output
+        ADDP    v16.4s, v16.4s, v18.4s  // row 0: pair sums of columns 0, 1
+        ADDP    v20.4s, v20.4s, v22.4s  // row 0: pair sums of columns 2, 3
+        ADDP    v24.4s, v24.4s, v26.4s  // row 0: pair sums of columns 4, 5
+        ADDP    v28.4s, v28.4s, v30.4s  // row 0: pair sums of columns 6, 7
+        ADDP    v17.4s, v17.4s, v19.4s  // row 1: pair sums of columns 0, 1
+        ADDP    v21.4s, v21.4s, v23.4s  // row 1: pair sums of columns 2, 3
+        ADDP    v25.4s, v25.4s, v27.4s  // row 1: pair sums of columns 4, 5
+        ADDP    v29.4s, v29.4s, v31.4s  // row 1: pair sums of columns 6, 7
+        ADDP    v0.4s, v16.4s, v20.4s   // v0 = row 0, columns 0-3
+        ADDP    v1.4s, v24.4s, v28.4s   // v1 = row 0, columns 4-7
+        ADDP    v2.4s, v17.4s, v21.4s   // v2 = row 1, columns 0-3
+        ADDP    v3.4s, v25.4s, v29.4s   // v3 = row 1, columns 4-7
+
+        # Apply params - scale, bias and clamp
+        SCVTF   v0.4s, v0.4s            // int32 -> float
+        LD1R    {v4.4s}, [x11], 4       // broadcast 32-bit scale, params += 4
+        SCVTF   v1.4s, v1.4s
+        SCVTF   v2.4s, v2.4s
+        SCVTF   v3.4s, v3.4s
+        FMUL    v0.4s, v0.4s, v4.4s     // same scale for every column
+        FMUL    v1.4s, v1.4s, v4.4s
+        FMUL    v2.4s, v2.4s, v4.4s
+        FMUL    v3.4s, v3.4s, v4.4s
+
+        FCVTNS  v0.4s, v0.4s            // float -> int32, round to nearest, ties to even
+        FCVTNS  v1.4s, v1.4s
+        FCVTNS  v2.4s, v2.4s
+        FCVTNS  v3.4s, v3.4s
+
+        LD1R    {v5.8h}, [x11], 2       // broadcast 16-bit bias term, params += 2
+        SQXTN   v0.4h, v0.4s            // saturating narrow int32 -> int16
+        SQXTN   v2.4h, v2.4s
+        SQXTN2  v0.8h, v1.4s            // v0 = row 0 as 8 x int16
+        SQXTN2  v2.8h, v3.4s            // v2 = row 1 as 8 x int16
+        SUBS    x1, x1, 8               // nc -= 8, flags used by B.LO and B.HI below
+        SQADD   v0.8h, v0.8h, v5.8h     // saturating add of the bias term
+        SQADD   v1.8h, v2.8h, v5.8h
+        SQXTN   v0.8b, v0.8h            // row 0 -> bytes 0-7
+        SQXTN2  v0.16b, v1.8h           // row 1 -> bytes 8-15
+        LD1R    {v1.16b}, [x11], 1      // broadcast lower clamp bound, params += 1
+        LD1R    {v2.16b}, [x11]         // broadcast upper clamp bound
+        SMAX    v0.16b, v0.16b, v1.16b
+        SUB     x11, x11, 7               // rewind params pointer (4 + 2 + 1 bytes)
+        SMIN    v0.16b, v0.16b, v2.16b
+        B.LO    3f                      // nc was < 8: partial-width store
+
+        # Store full 2 x 8
+        ST1     {v0.d}[1], [x7], x10    // row 1 from bytes 8-15, c1 += cn_stride
+        SUB     x4, x4, x3              // a -= ks
+        ST1     {v0.8b}, [x6], x10      // row 0 from bytes 0-7, c0 += cn_stride
+
+        # nc loop
+        B.HI    0b                      // flags still from SUBS x1: loop while nc > 0
+
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+        # Store odd width
+        .p2align 3
+3:
+        TBZ     x1, 2, 4f               // skip unless bit 2 of nc is set (4 columns)
+        ST1     {v0.s}[2], [x7], 4      // row 1: 4 bytes
+        STR     s0, [x6], 4             // row 0: 4 bytes
+        EXT     v0.16b, v0.16b, v0.16b, 4   // rotate so the next columns sit at bytes 0 and 8
+
+4:
+        TBZ     x1, 1, 5f               // skip unless bit 1 of nc is set (2 columns)
+        ST1     {v0.h}[4], [x7], 2      // row 1: 2 bytes
+        ST1     {v0.h}[0], [x6], 2      // row 0: 2 bytes
+        EXT     v0.16b, v0.16b, v0.16b, 2   // rotate by the 2 bytes just stored
+5:
+        TBZ     x1, 0, 6f               // skip unless bit 0 of nc is set (1 column)
+        ST1     {v0.b}[8], [x7]         // row 1: last byte
+        ST1     {v0.b}[0], [x6]         // row 0: last byte
+6:
+        # Restore d10-d15 from stack
+        LDP     d14, d15, [sp, 32]
+        LDP     d12, d13, [sp, 16]
+        LDP     d10, d11, [sp], 48
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
+
diff --git a/src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index c5455d4..8fa98c9 100644
--- a/src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <xnnpack/assembly.h>
 
 # void xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal(
@@ -21,7 +22,7 @@
 #     size_t cn_stride,                  [sp] -> x10
 #     size_t a_offset,                   [sp + 8] -> x8
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_qs8_conv_minmax_params params [sp + 24] -> x11
+#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -168,7 +169,6 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -181,6 +181,8 @@
         SRSHL   v1.4s, v1.4s, v7.4s
         SRSHL   v2.4s, v2.4s, v7.4s
         SRSHL   v3.4s, v3.4s, v7.4s
+
+        LD1R    {v5.8h}, [x11], 2
         SQXTN   v0.4h, v0.4s
         SQXTN   v2.4h, v2.4s
         SQXTN2  v0.8h, v1.4s
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 7c743e7..a8c7f59 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -651,6 +651,8 @@
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal)
 
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal)
+
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
@@ -1004,6 +1006,8 @@
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_6x16c4__neondot)
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_8x16c4__neondot)
 
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal)
+
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index aeafe6d..9989252 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -461,6 +461,8 @@
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal)
 
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal)
+
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
@@ -753,6 +755,8 @@
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_6x16c4__neondot)
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_8x16c4__neondot)
 
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal)
+
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
diff --git a/test/qc8-gemm-minmax-fp32.cc b/test/qc8-gemm-minmax-fp32.cc
index 42e82bb..e27b48e 100644
--- a/test/qc8-gemm-minmax-fp32.cc
+++ b/test/qc8-gemm-minmax-fp32.cc
@@ -23,6 +23,462 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {  // full 2x8 tile, k equal to the 16-element k-block
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)   // mr/nr/kr mirror the 2x8c16 in the kernel name
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {  // full tile with a non-default cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)  // 11 is larger than the 8-column tile width
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {  // full tile, k == 16, non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .a_stride(19)  // 19 is larger than k (16)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {  // every m x n sub-tile of 2x8 at k == 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+      for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {  // vary only m at full width, k == 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {  // vary only n at full height, k == 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {  // full tile, k shorter than one k-block
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {  // k < 16 with a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)  // 19 is larger than every k in this loop
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {  // every sub-tile for every k < 16
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {  // full tile, k between one and two k-blocks
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {  // 16 < k < 32 with a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(37)  // 37 is larger than every k in this loop
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {  // every sub-tile for 16 < k < 32
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {  // full tile, k a multiple of the k-block
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {  // k multiple of 16 with a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(163)  // 163 is larger than the largest k (160)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {  // every sub-tile for k multiple of 16
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {  // n wider than one 8-column tile, full height
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // follow the loop variable; a constant 8 would only re-test a single full tile
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {  // n wider than one tile with a non-default cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // follow the loop variable so more than one column tile is stored with cn_stride
+          .k(k)
+          .cn_stride(11)  // 11 is larger than the 8-column tile width
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {  // n wider than one tile with a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)  // 83 is larger than every k in this loop
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {  // n wider than one tile for every m
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {  // n a multiple of the 8-column tile, full height
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // follow the loop variable; a constant 8 would only re-test a single full tile
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {  // n multiple of 8 with a non-default cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)  // 11 is larger than the 8-column tile width
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {  // n multiple of 8 with a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)  // 83 is larger than every k in this loop
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {  // n multiple of 8 for every m
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {  // every sub-tile with a non-default cm_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)  // 11 is larger than every n in this loop
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {  // full tile with a non-default qmin
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)  // NOTE(review): presumably tightens the lower output clamp; confirm in GemmMicrokernelTester
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {  // full tile with a non-default qmax
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)  // NOTE(review): presumably tightens the upper output clamp; confirm in GemmMicrokernelTester
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {  // full tile with a non-default cm_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)  // 11 is larger than n (8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
diff --git a/test/qc8-gemm-minmax-fp32.yaml b/test/qc8-gemm-minmax-fp32.yaml
index fc4a252..0440f76 100644
--- a/test/qc8-gemm-minmax-fp32.yaml
+++ b/test/qc8-gemm-minmax-fp32.yaml
@@ -3,6 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+  init: xnn_init_qs8_minmax_neon_params
+  k-block: 16
 - name: xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 8
diff --git a/test/qc8-igemm-minmax-fp32.cc b/test/qc8-igemm-minmax-fp32.cc
index 8f517b4..5237802 100644
--- a/test/qc8-igemm-minmax-fp32.cc
+++ b/test/qc8-igemm-minmax-fp32.cc
@@ -23,6 +23,474 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {  // full 2x8 tile with k equal to kr (16)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {  // full 2x8 tile, k == 16, with cn_stride set to 11 (> nr)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {  // every m in 1..2 and n in 1..8 at k == 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)  // one iteration per (m, n) combination
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {  // m varies over 1..2; n fixed at 8, k at 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)  // one iteration per m
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {  // n varies over 1..8; m fixed at 2, k at 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)  // one iteration per n
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {  // full 2x8 tile for every k in 1..15 (below kr)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {  // k in 1..15 crossed with every m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // one iteration per (k, m, n) combination
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {  // full 2x8 tile for every k in 17..31 (between kr and 2*kr)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {  // k in 17..31 crossed with every m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // one iteration per (k, m, n) combination
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {  // full 2x8 tile for k = 32, 48, ..., 160 (multiples of kr)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {  // k = 32, 48, ..., 160 crossed with every m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // one iteration per (k, m, n) combination
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {  // n = 9..15 (more columns than nr) for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // must follow the loop variable; a fixed 8 would never exceed nr
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {  // n = 9..15 with cn_stride 11, for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // must follow the loop variable; a fixed 8 would never exceed nr
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {  // n = 9..15, k in {1, 18, 35, 52, 69}, every m in 1..2
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // one iteration per (n, k, m) combination
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {  // n = 16 and 24 (multiples of nr) for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // must follow the loop variable; a fixed 8 would only ever test one nr-wide block
+          .k(k)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {  // n = 16 and 24 with cn_stride 11, for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {  // n = 16 and 24, k in {1, 18, 35, 52, 69}, every m in 1..2
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // one iteration per (n, k, m) combination
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel) {  // full 2x8 tile with ks set to 3, for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {  // ks == 3 with every m in 1..2 and n in 1..8, k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)  // one iteration per (k, m, n) combination
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {  // n = 9..15 with ks == 3, for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // must follow the loop variable; a fixed 8 would never exceed nr
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {  // n = 16 and 24 with ks == 3, for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // must follow the loop variable; a fixed 8 would only ever test one nr-wide block
+          .k(k)
+          .ks(3)
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {  // cm_stride 11 with every m in 1..2 and n in 1..8, k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)  // one iteration per (k, m, n) combination
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, a_offset) {  // full 2x8 tile, ks == 3, a_offset set to 163, k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, zero) {  // as a_offset, additionally setting zero_index to each mz in 0..1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)  // NOTE(review): presumably selects which row uses the zero buffer; confirm in GemmMicrokernelTester
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {  // full 2x8 tile, k == 16, with qmin set to 128
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {  // full 2x8 tile, k == 16, with qmax set to 128
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {  // full 2x8 tile, k == 16, with cm_stride set to 11 (> nr)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
diff --git a/test/qc8-igemm-minmax-fp32.yaml b/test/qc8-igemm-minmax-fp32.yaml
index 0ab6b5e..d2e34c2 100644
--- a/test/qc8-igemm-minmax-fp32.yaml
+++ b/test/qc8-igemm-minmax-fp32.yaml
@@ -3,6 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+  init: xnn_init_qs8_minmax_neon_params
+  k-block: 16
 - name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 8
diff --git a/test/qs8-gemm-minmax-fp32.cc b/test/qs8-gemm-minmax-fp32.cc
index a7bf802..6269a55 100644
--- a/test/qs8-gemm-minmax-fp32.cc
+++ b/test/qs8-gemm-minmax-fp32.cc
@@ -23,6 +23,462 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {  // full 2x8 tile with k equal to kr (16)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {  // full 2x8 tile, k == 16, with cn_stride set to 11 (> nr)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {  // full 2x8 tile, k == 16, with a_stride set to 19 (> k)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {  // every m in 1..2 and n in 1..8 at k == 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)  // one iteration per (m, n) combination
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {  // m varies over 1..2; n fixed at 8, k at 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)  // one iteration per m
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {  // n varies over 1..8; m fixed at 2, k at 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)  // one iteration per n
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {  // full 2x8 tile for every k in 1..15 (below kr)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {  // k in 1..15 with a_stride set to 19 (> every k tested)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {  // k in 1..15 crossed with every m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // one iteration per (k, m, n) combination
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {  // full 2x8 tile for every k in 17..31 (between kr and 2*kr)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {  // k in 17..31 with a_stride set to 37 (> every k tested)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {  // k in 17..31 crossed with every m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // one iteration per (k, m, n) combination
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {  // full 2x8 tile for k = 32, 48, ..., 160 (multiples of kr)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {  // k = 32, 48, ..., 160 with a_stride set to 163 (> every k tested)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {  // k = 32, 48, ..., 160 crossed with every m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // one iteration per (k, m, n) combination
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {  // n = 9..15 (more columns than nr) for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // must follow the loop variable; a fixed 8 would never exceed nr
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {  // n = 9..15 with cn_stride 11, for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // must follow the loop variable; a fixed 8 would never exceed nr
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {  // n = 9..15 with a_stride 83, for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)  // larger than every k tested (max 69)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {  // n = 9..15, k in {1, 18, 35, 52, 69}, every m in 1..2
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // one iteration per (n, k, m) combination
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {  // n = 16 and 24 (multiples of nr) for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // must follow the loop variable; a fixed 8 would only ever test one nr-wide block
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {  // n = 16 and 24 with cn_stride 11, for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {  // n = 16 and 24 with a_stride 83, for k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)  // larger than every k tested (max 69)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {  // n = 16 and 24, k in {1, 18, 35, 52, 69}, every m in 1..2
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // one iteration per (n, k, m) combination
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {  // cm_stride 11 with every m in 1..2 and n in 1..8, k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)  // one iteration per (k, m, n) combination
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {  // full 2x8 tile, k == 16, with qmin set to 128
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {  // full 2x8 tile, k == 16, with qmax set to 128
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {  // full 2x8 tile, k == 16, with cm_stride set to 11 (> nr)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
diff --git a/test/qs8-gemm-minmax-fp32.yaml b/test/qs8-gemm-minmax-fp32.yaml
index e13389f..945a1fe 100644
--- a/test/qs8-gemm-minmax-fp32.yaml
+++ b/test/qs8-gemm-minmax-fp32.yaml
@@ -3,6 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 16
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 8
diff --git a/test/qs8-gemm-minmax-gemmlowp.cc b/test/qs8-gemm-minmax-gemmlowp.cc
index 709167f..9d20437 100644
--- a/test/qs8-gemm-minmax-gemmlowp.cc
+++ b/test/qs8-gemm-minmax-gemmlowp.cc
@@ -23,6 +23,462 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {  // baseline: full 2x8 tile with k = 16
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)  // m == mr
+      .n(8)  // n == nr
+      .k(16)  // k == kr
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {  // k_eq_16 configuration plus a non-default cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)  // 11 is larger than nr = 8
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {  // k_eq_16 configuration plus a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .a_stride(19)  // 19 is larger than k = 16
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {  // every m x n output size up to 2x8 at k = 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {  // m = 1..mr
+      for (uint32_t n = 1; n <= 8; n++) {  // n = 1..nr
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)  // iteration count set to 1 for each (m, n) pair
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {  // varies only m at k = 16, n fixed at 8
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {  // m = 1..mr
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)  // iteration count set to 1 for each m
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {  // varies only n at k = 16, m fixed at 2
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {  // n = 1..nr
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)  // iteration count set to 1 for each n
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {  // full 2x8 tile for every k below 16
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15, all smaller than kr
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {  // k below 16 with a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(19)  // 19 is larger than every k in the loop
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {  // every m x n output size up to 2x8 for every k below 16
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1..mr
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..nr
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // iteration count set to 1 for each (k, m, n) combination
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {  // full 2x8 tile for k strictly between 16 and 32
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {  // k in 17..31 with a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(37)  // 37 is larger than every k in the loop
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {  // every m x n output size up to 2x8 for k in 17..31
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1..mr
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..nr
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // iteration count set to 1 for each (k, m, n) combination
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {  // full 2x8 tile for k that are multiples of 16
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {  // multiples-of-16 k with a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .a_stride(163)  // 163 is larger than the largest k (160)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {  // every m x n output size up to 2x8 for multiples-of-16 k
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1..mr
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..nr
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // iteration count set to 1 for each (k, m, n) combination
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {  // m = 2 with n larger than nr, across a spread of k
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15, all strictly greater than nr = 8
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop variable so each n in 9..15 is actually tested
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {  // n larger than nr with a non-default cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15, all strictly greater than nr = 8
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop variable so each n in 9..15 is actually tested
+          .k(k)
+          .cn_stride(11)  // 11 is larger than nr = 8
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {  // n larger than nr with a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)  // 83 is larger than every k in the loop
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {  // n larger than nr combined with every m up to mr
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        for (uint32_t m = 1; m <= 2; m++) {  // m = 1..mr
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // iteration count set to 1 for each (n, k, m) combination
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {  // m = 2 with n a larger multiple of nr, across a spread of k
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24 (multiples of nr = 8)
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop variable so both n = 16 and n = 24 are actually tested
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {  // n a larger multiple of nr with a non-default cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)  // 11 is larger than nr = 8
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {  // n a larger multiple of nr with a non-default a_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .a_stride(83)  // 83 is larger than every k in the loop
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {  // n a larger multiple of nr combined with every m up to mr
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        for (uint32_t m = 1; m <= 2; m++) {  // m = 1..mr
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // iteration count set to 1 for each (n, k, m) combination
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {  // every m x n output size up to 2x8 with a non-default cm_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1..mr
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..nr
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)  // 11 is larger than the largest n (8)
+            .iterations(1)  // iteration count set to 1 for each (k, m, n) combination
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {  // k_eq_16 configuration with the tester's qmin overridden
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)  // NOTE(review): presumably the lower output clamp bound; confirm against GemmMicrokernelTester
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {  // k_eq_16 configuration with the tester's qmax overridden
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)  // NOTE(review): presumably the upper output clamp bound; confirm against GemmMicrokernelTester
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {  // k_eq_16 configuration plus a non-default cm_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)  // 11 is larger than n = 8
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -3670,4110 +4126,6 @@
 #endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MULL_PADAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
@@ -42430,462 +38782,6 @@
 #endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X4C2__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
diff --git a/test/qs8-gemm-minmax-gemmlowp.yaml b/test/qs8-gemm-minmax-gemmlowp.yaml
index 9748274..3053ae8 100644
--- a/test/qs8-gemm-minmax-gemmlowp.yaml
+++ b/test/qs8-gemm-minmax-gemmlowp.yaml
@@ -3,6 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 16
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
@@ -27,33 +30,6 @@
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_padal
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 8
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
@@ -282,9 +258,6 @@
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_8x16c4__neondot
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64
   init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
   k-block: 8
diff --git a/test/qs8-igemm-minmax-fp32.cc b/test/qs8-igemm-minmax-fp32.cc
index c60a0c0..9ac4d84 100644
--- a/test/qs8-igemm-minmax-fp32.cc
+++ b/test/qs8-igemm-minmax-fp32.cc
@@ -23,6 +23,474 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {  // baseline case: m == mr, n == nr, k == kr
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {  // baseline 2x8 shape, k=16, with an explicit cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)  // 11 > nr(8); presumably the output stride between column tiles - confirm in GemmMicrokernelTester
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {  // k=16 for every m in [1,2] and n in [1,8]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)  // presumably one run per shape to keep the sweep cheap - confirm in GemmMicrokernelTester
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {  // k=16, n=8, m swept over [1,2]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)  // presumably one run per shape - confirm in GemmMicrokernelTester
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {  // k=16, m=2, n swept over [1,8]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)  // presumably one run per shape - confirm in GemmMicrokernelTester
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {  // 2x8 shape with every k in [1,15], i.e. below kr
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {  // k in [1,15] crossed with m in [1,2] and n in [1,8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // presumably one run per shape to keep the triple sweep cheap - confirm
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {  // 2x8 shape with every k in [17,31], strictly between kr and 2*kr
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {  // k in [17,31] crossed with m in [1,2] and n in [1,8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // presumably one run per shape to keep the triple sweep cheap - confirm
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {  // 2x8 shape with k = 32, 48, ..., 160 (multiples of 16)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {  // k = 32, 48, ..., 160 crossed with m in [1,2] and n in [1,8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // presumably one run per shape to keep the triple sweep cheap - confirm
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {  // m=2 with n in [9,15] (above nr) and k in {1,18,35,52,69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // was .n(8): the loop variable went unused, so n > nr was never exercised
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {  // n in [9,15] (above nr), k in {1,18,35,52,69}, explicit cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // was .n(8): the loop variable went unused, so n > nr was never exercised
+          .k(k)
+          .cn_stride(11)  // 11 > nr(8); presumably the output stride between column tiles - confirm in GemmMicrokernelTester
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {  // n in [9,15], k in {1,18,35,52,69}, m in [1,2]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // presumably one run per shape to keep the triple sweep cheap - confirm
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {  // m=2 with n in {16,24} (multiples of nr) and k in {1,18,35,52,69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // was .n(8): the loop variable went unused, so multiples of nr were never exercised
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {  // n in {16,24}, k in {1,18,35,52,69}, explicit cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)  // 11 > nr(8); presumably the output stride between column tiles - confirm in GemmMicrokernelTester
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {  // n in {16,24}, k in {1,18,35,52,69}, m in [1,2]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // presumably one run per shape to keep the triple sweep cheap - confirm
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel) {  // 2x8 shape, k in {1,18,35,52,69}, with ks=3
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)  // presumably the kernel size (taps per output) of the indirect GEMM - confirm in GemmMicrokernelTester
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {  // ks=3 with k in {1,18,35,52,69}, m in [1,2], n in [1,8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)  // presumably the kernel size (taps per output) of the indirect GEMM - confirm
+            .iterations(1)  // presumably one run per shape to keep the triple sweep cheap - confirm
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {  // ks=3 with n in [9,15] (above nr) and k in {1,18,35,52,69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // was .n(8): the loop variable went unused, so n > nr was never exercised
+          .k(k)
+          .ks(3)  // presumably the kernel size (taps per output) of the indirect GEMM - confirm
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {  // ks=3 with n in {16,24} (multiples of nr) and k in {1,18,35,52,69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(n)  // was .n(8): the loop variable went unused, so multiples of nr were never exercised
+          .k(k)
+          .ks(3)  // presumably the kernel size (taps per output) of the indirect GEMM - confirm
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {  // explicit cm_stride with k in {1,18,35,52,69}, m in [1,2], n in [1,8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(16)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)  // 11 > nr(8); presumably the stride between output rows - confirm in GemmMicrokernelTester
+            .iterations(1)  // presumably one run per shape to keep the triple sweep cheap - confirm
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, a_offset) {  // 2x8 shape, ks=3, k in {1,18,35,52,69}, with a non-zero a_offset
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(16)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)  // presumably the kernel size (taps per output) of the indirect GEMM - confirm
+        .a_offset(163)  // presumably an offset applied to the input pointers - confirm in GemmMicrokernelTester
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, zero) {  // a_offset case repeated with zero_index set to each mz in [0,1]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(16)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)  // presumably the kernel size (taps per output) of the indirect GEMM - confirm
+          .a_offset(163)  // presumably an offset applied to the input pointers - confirm in GemmMicrokernelTester
+          .zero_index(mz)  // presumably selects which row is pointed at the zero buffer - confirm in GemmMicrokernelTester
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {  // baseline 2x8 shape, k=16, with qmin overridden
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)  // presumably the lower output clamp - confirm encoding in GemmMicrokernelTester
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {  // baseline 2x8 shape, k=16, with qmax overridden
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)  // presumably the upper output clamp - confirm encoding in GemmMicrokernelTester
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {  // baseline 2x8 shape, k=16, with an explicit cm_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(16)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)  // 11 > nr(8); presumably the stride between output rows - confirm in GemmMicrokernelTester
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
diff --git a/test/qs8-igemm-minmax-fp32.yaml b/test/qs8-igemm-minmax-fp32.yaml
index c30c31b..b2ecf87 100644
--- a/test/qs8-igemm-minmax-fp32.yaml
+++ b/test/qs8-igemm-minmax-fp32.yaml
@@ -3,6 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal_padal
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 16
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 8
diff --git a/test/qs8-igemm-minmax-gemmlowp.cc b/test/qs8-igemm-minmax-gemmlowp.cc
index 6ab8d9c..bbb87a5 100644
--- a/test/qs8-igemm-minmax-gemmlowp.cc
+++ b/test/qs8-igemm-minmax-gemmlowp.cc
@@ -23,6 +23,4218 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {  // Full 2x8 tile (m == mr, n == nr) at k = 16.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {  // Full 2x8 tile at k = 16 with cn_stride = 11 (larger than nr = 8).
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {  // k = 16; every (m, n) with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {  // k = 16, n = 8; m swept over 1..2.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {  // k = 16, m = 2; n swept over 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {  // Full 2x8 tile; k swept over 1..15.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {  // k in 1..15 crossed with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {  // Full 2x8 tile; k swept over 17..31.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {  // k in 17..31 crossed with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {  // Full 2x8 tile; k = 32, 48, ..., 160 (multiples of 16).
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {  // k = 32, 48, ..., 160 crossed with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {  // m = 2; n in 9..15 (above nr = 8); k in {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // Use the loop's n; a fixed 8 would only repeat the n == nr case.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {  // m = 2; n in 9..15; k in {1, 18, 35, 52, 69}; cn_stride = 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // Use the loop's n; a fixed 8 would only repeat the n == nr case.
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {  // n in 9..15, k in {1, 18, 35, 52, 69}, m in 1..2.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {  // m = 2; n in {16, 24} (multiples of nr = 8); k in {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // Use the loop's n; a fixed 8 would only repeat the n == nr case.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {  // m = 2; n in {16, 24}; k in {1, 18, 35, 52, 69}; cn_stride = 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {  // n in {16, 24}, k in {1, 18, 35, 52, 69}, m in 1..2.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {  // Full 2x8 tile with ks = 3; k in {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {  // ks = 3; k in {1, 18, 35, 52, 69} crossed with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {  // m = 2; n in 9..15; k in {1, 18, 35, 52, 69}; ks = 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // Use the loop's n; a fixed 8 would only repeat the n == nr case.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {  // m = 2; n in {16, 24}; k in {1, 18, 35, 52, 69}; ks = 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // Use the loop's n; a fixed 8 would only repeat the n == nr case.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {  // cm_stride = 11; k in {1, 18, 35, 52, 69} crossed with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {  // Full 2x8 tile with ks = 3 and a_offset = 163; k in {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, zero) {  // Full 2x8 tile, ks = 3, a_offset = 163; zero_index mz in 0..1; k in {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {  // Full 2x8 tile at k = 16 with qmin set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {  // Full 2x8 tile at k = 16 with qmax set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {  // Full 2x8 tile at k = 16 with cm_stride = 11 (larger than n = 8).
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {  // Full 2x8 tile (m == mr, n == nr) at k = 16.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {  // Full 2x8 tile at k = 16 with cn_stride = 11 (larger than nr = 8).
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {  // k = 16; every (m, n) with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {  // k = 16, n = 8; m swept over 1..2.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {  // k = 16, m = 2; n swept over 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {  // Full 2x8 tile; k swept over 1..15.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {  // k in 1..15 crossed with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {  // Full 2x8 tile; k swept over 17..31.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {  // k in 17..31 crossed with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {  // Full 2x8 tile; k = 32, 48, ..., 160 (multiples of 16).
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {  // k = 32, 48, ..., 160 crossed with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {  // m = 2; n in 9..15 (above nr = 8); k in {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // Use the loop's n; a fixed 8 would only repeat the n == nr case.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {  // m = 2; n in 9..15; k in {1, 18, 35, 52, 69}; cn_stride = 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // Use the loop's n; a fixed 8 would only repeat the n == nr case.
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {  // n in 9..15, k in {1, 18, 35, 52, 69}, m in 1..2.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {  // m = 2; n in {16, 24} (multiples of nr = 8); k in {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // Use the loop's n; a fixed 8 would only repeat the n == nr case.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {  // m = 2; n in {16, 24}; k in {1, 18, 35, 52, 69}; cn_stride = 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {  // n in {16, 24}, k in {1, 18, 35, 52, 69}, m in 1..2.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {  // Full 2x8 tile with ks = 3; k in {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {  // ks = 3; k in {1, 18, 35, 52, 69} crossed with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {  // m = 2; n in 9..15; k in {1, 18, 35, 52, 69}; ks = 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // Use the loop's n; a fixed 8 would only repeat the n == nr case.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {  // m = 2; n in {16, 24}; k in {1, 18, 35, 52, 69}; ks = 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // Use the loop's n; a fixed 8 would only repeat the n == nr case.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {  // cm_stride = 11; k in {1, 18, 35, 52, 69} crossed with m in 1..2 and n in 1..8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {  // Full 2x8 tile with ks = 3 and a_offset = 163; k in {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {  // Full 2x8 tile, ks = 3, a_offset = 163; zero_index mz in 0..1; k in {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {  // Full 2x8 tile at k = 16 with qmin set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {  // Full 2x8 tile at k = 16 with qmax set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {  // Full 2x8 tile, k = 16, with cm_stride explicitly set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)  // 11 is larger than n = 8
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {  // Full 2x8 tile (m = mr, n = nr) with k = 16.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {  // Full 2x8 tile, k = 16, with cn_stride explicitly set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)  // 11 is larger than nr = 8
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {  // k = 16 for every (m, n) up to the full 2x8 tile.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+      for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {  // k = 16, n = 8, varying only m.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {  // k = 16, m = 2, varying only n.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {  // Full 2x8 tile for every k below 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {  // Every (k, m, n) with k below 16 and (m, n) up to the full 2x8 tile.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {  // Full 2x8 tile for every k strictly between 16 and 32.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {  // Every (k, m, n) with k in 17..31 and (m, n) up to the full 2x8 tile.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {  // Full 2x8 tile for k at multiples of 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {  // k at multiples of 16 for every (m, n) up to the full 2x8 tile.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {  // m = 2 with n above nr = 8, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // use the loop variable; a constant n(8) never exercises n > nr
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {  // m = 2 with n above nr = 8 and cn_stride = 11, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // use the loop variable; a constant n(8) never exercises n > nr
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {  // n above nr = 8, swept over k and m.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {  // m = 2 with n at multiples of nr = 8 above nr, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // use the loop variable; a constant n(8) never exercises n = 16 or 24
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {  // m = 2 with n at multiples of 8 and cn_stride = 11, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {  // n at multiples of 8, swept over k and m.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {  // Full 2x8 tile with ks = 3, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {  // ks = 3 for every (m, n) up to the full 2x8 tile, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {  // m = 2 with n above nr = 8 and ks = 3, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // use the loop variable; a constant n(8) never exercises n > nr
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {  // m = 2 with n at multiples of 8 above nr and ks = 3, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // use the loop variable; a constant n(8) never exercises n = 16 or 24
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {  // cm_stride = 11 for every (m, n) up to the full 2x8 tile, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {  // Full 2x8 tile with ks = 3 and a_offset = 163, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {  // Full 2x8 tile, ks = 3, a_offset = 163, swept over zero_index and k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {  // mz = 0, 1
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {  // Full 2x8 tile, k = 16, with qmin explicitly set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {  // Full 2x8 tile, k = 16, with qmax explicitly set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {  // Full 2x8 tile, k = 16, with cm_stride explicitly set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)  // 11 is larger than n = 8
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {  // Full 2x8 tile (m = mr, n = nr) with k = 16.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {  // Full 2x8 tile, k = 16, with cn_stride explicitly set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)  // 11 is larger than nr = 8
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {  // k = 16 for every (m, n) up to the full 2x8 tile.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+      for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {  // k = 16, n = 8, varying only m.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {  // k = 16, m = 2, varying only n.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {  // Full 2x8 tile for every k below 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {  // Every (k, m, n) with k below 16 and (m, n) up to the full 2x8 tile.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {  // Full 2x8 tile for every k strictly between 16 and 32.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {  // Every (k, m, n) with k in 17..31 and (m, n) up to the full 2x8 tile.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {  // Full 2x8 tile for k at multiples of 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {  // k at multiples of 16 for every (m, n) up to the full 2x8 tile.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {  // m = 2 with n above nr = 8, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // use the loop variable; a constant n(8) never exercises n > nr
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {  // m = 2 with n above nr = 8 and cn_stride = 11, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // use the loop variable; a constant n(8) never exercises n > nr
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {  // n above nr = 8, swept over k and m.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {  // m = 2 with n at multiples of nr = 8 above nr, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // use the loop variable; a constant n(8) never exercises n = 16 or 24
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {  // m = 2 with n at multiples of 8 and cn_stride = 11, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {  // n at multiples of 8, swept over k and m.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {  // Full 2x8 tile with ks = 3, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {  // ks = 3 for every (m, n) up to the full 2x8 tile, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {  // m = 2 with n above nr = 8 and ks = 3, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // use the loop variable; a constant n(8) never exercises n > nr
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {  // m = 2 with n at multiples of 8 above nr and ks = 3, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // use the loop variable; a constant n(8) never exercises n = 16 or 24
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {  // cm_stride = 11 for every (m, n) up to the full 2x8 tile, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      for (uint32_t m = 1; m <= 2; m++) {  // m = 1, 2
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {  // Full 2x8 tile with ks = 3 and a_offset = 163, swept over k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {  // Full 2x8 tile, ks = 3, a_offset = 163, swept over zero_index and k.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {  // mz = 0, 1
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
+    // Single case: m = 2, n = 8, k = 16 with qmin set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2).nr(8).kr(8).sr(1)
+      .m(2).n(8).k(16)
+      .qmin(128)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
+    // Single case: m = 2, n = 8, k = 16 with qmax set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2).nr(8).kr(8).sr(1)
+      .m(2).n(8).k(16)
+      .qmax(128)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
+    // Single case: m = 2, n = 8, k = 16 with cm_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2).nr(8).kr(8).sr(1)
+      .m(2).n(8).k(16)
+      .cm_stride(11)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+    // Single case: m = 2, n = 8, k = 16, no optional tester settings applied.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2).nr(8).kr(16).sr(1)
+      .m(2).n(8).k(16)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+    // Single case: m = 2, n = 8, k = 16 with cn_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2).nr(8).kr(16).sr(1)
+      .m(2).n(8).k(16)
+      .cn_stride(11)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+    // k = 16 for every (m, n) with 1 <= m <= 2 and 1 <= n <= 8; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; ++m) {
+      for (uint32_t n = 1; n <= 8; ++n) {
+        GemmMicrokernelTester()
+          .mr(2).nr(8).kr(16).sr(1)
+          .m(m).n(n).k(16)
+          .iterations(1)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    // k = 16 and n = 8, with m swept over 1..2; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; ++m) {
+      GemmMicrokernelTester()
+        .mr(2).nr(8).kr(16).sr(1)
+        .m(m).n(8).k(16)
+        .iterations(1)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    // k = 16 and m = 2, with n swept over 1..8; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; ++n) {
+      GemmMicrokernelTester()
+        .mr(2).nr(8).kr(16).sr(1)
+        .m(2).n(n).k(16)
+        .iterations(1)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+    // m = 2, n = 8 with k swept over 1..15.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; ++k) {
+      GemmMicrokernelTester()
+        .mr(2).nr(8).kr(16).sr(1)
+        .m(2).n(8).k(k)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+    // k swept over 1..15 for every (m, n) with 1 <= m <= 2 and 1 <= n <= 8; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; ++k) {
+      for (uint32_t m = 1; m <= 2; ++m) {
+        for (uint32_t n = 1; n <= 8; ++n) {
+          GemmMicrokernelTester()
+            .mr(2).nr(8).kr(16).sr(1)
+            .m(m).n(n).k(k)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+    // m = 2, n = 8 with k swept over 17..31.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; ++k) {
+      GemmMicrokernelTester()
+        .mr(2).nr(8).kr(16).sr(1)
+        .m(2).n(8).k(k)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+    // k swept over 17..31 for every (m, n) with 1 <= m <= 2 and 1 <= n <= 8; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; ++k) {
+      for (uint32_t m = 1; m <= 2; ++m) {
+        for (uint32_t n = 1; n <= 8; ++n) {
+          GemmMicrokernelTester()
+            .mr(2).nr(8).kr(16).sr(1)
+            .m(m).n(n).k(k)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+    // m = 2, n = 8 with k stepping through the multiples of 16 from 32 to 160.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2).nr(8).kr(16).sr(1)
+        .m(2).n(8).k(k)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+    // k = 32, 48, ..., 160 for every (m, n) with 1 <= m <= 2 and 1 <= n <= 8; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; ++m) {
+        for (uint32_t n = 1; n <= 8; ++n) {
+          GemmMicrokernelTester()
+            .mr(2).nr(8).kr(16).sr(1)
+            .m(m).n(n).k(k)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+    // m = 2 with n swept over 9..15 and k over {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; ++n) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2).nr(8).kr(16).sr(1)
+          .m(2).n(n).k(k)  // n(n), not n(8): a constant here left the n loop variable unused
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    // m = 2 with n swept over 9..15 and k over {1, 18, 35, 52, 69}; cn_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; ++n) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2).nr(8).kr(16).sr(1)
+          .m(2).n(n).k(k)  // n(n), not n(8): a constant here left the n loop variable unused
+          .cn_stride(11)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+    // n swept over 9..15, k over {1, 18, 35, 52, 69}, m over 1..2; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; ++n) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; ++m) {
+          GemmMicrokernelTester()
+            .mr(2).nr(8).kr(16).sr(1)
+            .m(m).n(n).k(k)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+    // m = 2 with n in {16, 24} and k over {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2).nr(8).kr(16).sr(1)
+          .m(2).n(n).k(k)  // n(n), not n(8): a constant here left the n loop variable unused
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    // m = 2 with n in {16, 24} and k over {1, 18, 35, 52, 69}; cn_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2).nr(8).kr(16).sr(1)
+          .m(2).n(n).k(k)
+          .cn_stride(11)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+    // n in {16, 24}, k over {1, 18, 35, 52, 69}, m over 1..2; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; ++m) {
+          GemmMicrokernelTester()
+            .mr(2).nr(8).kr(16).sr(1)
+            .m(m).n(n).k(k)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+    // m = 2, n = 8, k in {1, 18, 35, 52, 69}, with ks set to 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2).nr(8).kr(16).sr(1)
+        .m(2).n(8).k(k)
+        .ks(3)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+    // ks = 3; k in {1, 18, 35, 52, 69}; every (m, n) with 1 <= m <= 2, 1 <= n <= 8; iterations(1).
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; ++m) {
+        for (uint32_t n = 1; n <= 8; ++n) {
+          GemmMicrokernelTester()
+            .mr(2).nr(8).kr(16).sr(1)
+            .m(m).n(n).k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    // m = 2 with n swept over 9..15 and k over {1, 18, 35, 52, 69}; ks set to 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; ++n) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2).nr(8).kr(16).sr(1)
+          .m(2).n(n).k(k)  // n(n), not n(8): a constant here left the n loop variable unused
+          .ks(3)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    // m = 2 with n in {16, 24} and k over {1, 18, 35, 52, 69}; ks set to 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2).nr(8).kr(16).sr(1)
+          .m(2).n(n).k(k)  // n(n), not n(8): a constant here left the n loop variable unused
+          .ks(3)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+    // k in {1, 18, 35, 52, 69}; every (m, n) with 1 <= m <= 2, 1 <= n <= 8; cm_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; ++m) {
+        for (uint32_t n = 1; n <= 8; ++n) {
+          GemmMicrokernelTester()
+            .mr(2).nr(8).kr(16).sr(1)
+            .m(m).n(n).k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, a_offset) {
+    // m = 2, n = 8, k in {1, 18, 35, 52, 69}, with ks = 3 and a_offset = 163.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2).nr(8).kr(16).sr(1)
+        .m(2).n(8).k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, zero) {
+    // zero_index swept over 0..1 for k in {1, 18, 35, 52, 69}, with ks = 3 and a_offset = 163.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; ++mz) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2).nr(8).kr(16).sr(1)
+          .m(2).n(8).k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
+    // Single case: m = 2, n = 8, k = 16 with qmin set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2).nr(8).kr(16).sr(1)
+      .m(2).n(8).k(16)
+      .qmin(128)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
+    // Single case: m = 2, n = 8, k = 16 with qmax set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2).nr(8).kr(16).sr(1)
+      .m(2).n(8).k(16)
+      .qmax(128)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
+    // Single case: m = 2, n = 8, k = 16 with cm_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2).nr(8).kr(16).sr(1)
+      .m(2).n(8).k(16)
+      .cm_stride(11)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
+    // Single case: m = 1, n = 8, k = 16, no optional tester settings applied.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1).nr(8).kr(8).sr(1)
+      .m(1).n(8).k(16)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
+    // Single case: m = 1, n = 8, k = 16 with cn_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1).nr(8).kr(8).sr(1)
+      .m(1).n(8).k(16)
+      .cn_stride(11)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
+    // k = 16 with n swept over 1..8; the m loop covers only m = 1; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; ++m) {
+      for (uint32_t n = 1; n <= 8; ++n) {
+        GemmMicrokernelTester()
+          .mr(1).nr(8).kr(8).sr(1)
+          .m(m).n(n).k(16)
+          .iterations(1)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+    // k = 16 and n = 8; the m loop covers only m = 1; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; ++m) {
+      GemmMicrokernelTester()
+        .mr(1).nr(8).kr(8).sr(1)
+        .m(m).n(8).k(16)
+        .iterations(1)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+    // k = 16 and m = 1, with n swept over 1..8; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; ++n) {
+      GemmMicrokernelTester()
+        .mr(1).nr(8).kr(8).sr(1)
+        .m(1).n(n).k(16)
+        .iterations(1)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
+    // m = 1, n = 8 with k swept over 1..15.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; ++k) {
+      GemmMicrokernelTester()
+        .mr(1).nr(8).kr(8).sr(1)
+        .m(1).n(8).k(k)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
+    // k swept over 1..15 and n over 1..8; the m loop covers only m = 1; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; ++k) {
+      for (uint32_t m = 1; m <= 1; ++m) {
+        for (uint32_t n = 1; n <= 8; ++n) {
+          GemmMicrokernelTester()
+            .mr(1).nr(8).kr(8).sr(1)
+            .m(m).n(n).k(k)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
+    // m = 1, n = 8 with k swept over 17..31.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; ++k) {
+      GemmMicrokernelTester()
+        .mr(1).nr(8).kr(8).sr(1)
+        .m(1).n(8).k(k)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
+    // k swept over 17..31 and n over 1..8; the m loop covers only m = 1; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; ++k) {
+      for (uint32_t m = 1; m <= 1; ++m) {
+        for (uint32_t n = 1; n <= 8; ++n) {
+          GemmMicrokernelTester()
+            .mr(1).nr(8).kr(8).sr(1)
+            .m(m).n(n).k(k)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
+    // m = 1, n = 8 with k stepping through the multiples of 16 from 32 to 160.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1).nr(8).kr(8).sr(1)
+        .m(1).n(8).k(k)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
+    // k = 32, 48, ..., 160 and n over 1..8; the m loop covers only m = 1; iterations(1) per case.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 1; ++m) {
+        for (uint32_t n = 1; n <= 8; ++n) {
+          GemmMicrokernelTester()
+            .mr(1).nr(8).kr(8).sr(1)
+            .m(m).n(n).k(k)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
+    // m = 1 with n swept over 9..15 and k over {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; ++n) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1).nr(8).kr(8).sr(1)
+          .m(1).n(n).k(k)  // n(n), not n(8): a constant here left the n loop variable unused
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+    // m = 1 with n swept over 9..15 and k over {1, 18, 35, 52, 69}; cn_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; ++n) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1).nr(8).kr(8).sr(1)
+          .m(1).n(n).k(k)  // n(n), not n(8): a constant here left the n loop variable unused
+          .cn_stride(11)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
+    // n swept over 9..15 and k over {1, 18, 35, 52, 69}; the m loop covers only m = 1; iterations(1).
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; ++n) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; ++m) {
+          GemmMicrokernelTester()
+            .mr(1).nr(8).kr(8).sr(1)
+            .m(m).n(n).k(k)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
+    // m = 1 with n in {16, 24} and k over {1, 18, 35, 52, 69}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1).nr(8).kr(8).sr(1)
+          .m(1).n(n).k(k)  // n(n), not n(8): a constant here left the n loop variable unused
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
+    // m = 1 with n in {16, 24} and k over {1, 18, 35, 52, 69}; cn_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1).nr(8).kr(8).sr(1)
+          .m(1).n(n).k(k)
+          .cn_stride(11)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
+    // n in {16, 24} and k over {1, 18, 35, 52, 69}; the m loop covers only m = 1; iterations(1).
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; ++m) {
+          GemmMicrokernelTester()
+            .mr(1).nr(8).kr(8).sr(1)
+            .m(m).n(n).k(k)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
+    // m = 1, n = 8, k in {1, 18, 35, 52, 69}, with ks set to 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1).nr(8).kr(8).sr(1)
+        .m(1).n(8).k(k)
+        .ks(3)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
+    // ks = 3; k in {1, 18, 35, 52, 69}; n over 1..8; the m loop covers only m = 1; iterations(1).
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; ++m) {
+        for (uint32_t n = 1; n <= 8; ++n) {
+          GemmMicrokernelTester()
+            .mr(1).nr(8).kr(8).sr(1)
+            .m(m).n(n).k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+    // m = 1 with n swept over 9..15 and k over {1, 18, 35, 52, 69}; ks set to 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; ++n) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1).nr(8).kr(8).sr(1)
+          .m(1).n(n).k(k)  // n(n), not n(8): a constant here left the n loop variable unused
+          .ks(3)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
+    // m = 1 with n in {16, 24} and k over {1, 18, 35, 52, 69}; ks set to 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1).nr(8).kr(8).sr(1)
+          .m(1).n(n).k(k)  // n(n), not n(8): a constant here left the n loop variable unused
+          .ks(3)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
+    // cm_stride = 11; k in {1, 18, 35, 52, 69}; n over 1..8; the m loop covers only m = 1.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 1; ++m) {
+        for (uint32_t n = 1; n <= 8; ++n) {
+          GemmMicrokernelTester()
+            .mr(1).nr(8).kr(8).sr(1)
+            .m(m).n(n).k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(
+              xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+              xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+              xnn_init_qs8_requantization_gemmlowp_params,
+              xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
+    // m = 1, n = 8, k in {1, 18, 35, 52, 69}, with ks = 3 and a_offset = 83.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1).nr(8).kr(8).sr(1)
+        .m(1).n(8).k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(
+          xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+          xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+          xnn_init_qs8_requantization_gemmlowp_params,
+          xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
+    // zero_index takes only the value 0 here; k in {1, 18, 35, 52, 69}, ks = 3, a_offset = 83.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; ++mz) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1).nr(8).kr(8).sr(1)
+          .m(1).n(8).k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(
+            xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+            xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+            xnn_init_qs8_requantization_gemmlowp_params,
+            xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
+    // Single case: m = 1, n = 8, k = 16 with qmin set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1).nr(8).kr(8).sr(1)
+      .m(1).n(8).k(16)
+      .qmin(128)
+      .Test(
+        xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal,
+        xnn_init_qs8_conv_minmax_gemmlowp_neon_params,
+        xnn_init_qs8_requantization_gemmlowp_params,
+        xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {  // one run: m == mr, n == nr, k = 16, qmax(128)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)  // NOTE(review): presumably the upper output clamp - confirm in GemmMicrokernelTester
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {  // one run: m == mr, n == nr, k = 16, cm_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)  // 11 is larger than n (8)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {  // one run: m == mr, n == nr, k = 16
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {  // one run: m == mr, n == nr, k = 16, cn_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)  // 11 is larger than nr (8)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {  // k = 16 for m = 1 and each n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+      for (uint32_t nc = 1; nc <= 8; ++nc) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(mc)
+          .n(nc)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {  // k = 16, n = 8, m swept over [1, 1]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(mc)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {  // k = 16, m = 1, n swept over [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t nc = 1; nc <= 8; ++nc) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(nc)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {  // every k in [1, 15] with m = 1, n = 8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc < 16; ++kc) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(kc)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {  // k in [1, 15] crossed with m = 1 and n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc < 16; ++kc) {
+      for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+        for (uint32_t nc = 1; nc <= 8; ++nc) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {  // every k in [17, 31] with m = 1, n = 8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 17; kc < 32; ++kc) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(kc)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {  // k in [17, 31] crossed with m = 1 and n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 17; kc < 32; ++kc) {
+      for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+        for (uint32_t nc = 1; nc <= 8; ++nc) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {  // multiples of 16 from 32 through 160, m = 1, n = 8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 32; kc <= 160; kc += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(kc)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {  // k = 32, 48, ..., 160 crossed with m = 1 and n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 32; kc <= 160; kc += 16) {
+      for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+        for (uint32_t nc = 1; nc <= 8; ++nc) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {  // n in [9, 15] crossed with k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed 8 would only repeat the n == nr case
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {  // n in [9, 15], k in {1, 18, 35, 52, 69}, cn_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed 8 would only repeat the n == nr case
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {  // n in [9, 15], k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t nc = 9; nc < 16; ++nc) {
+      for (size_t kc = 1; kc <= 80; kc += 17) {
+        for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {  // n in {16, 24} crossed with k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed 8 would only repeat the n == nr case
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {  // n in {16, 24}, k in {1, 18, 35, 52, 69}, cn_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t nc = 16; nc <= 24; nc += 8) {
+      for (size_t kc = 1; kc <= 80; kc += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(nc)
+          .k(kc)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {  // n in {16, 24}, k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t nc = 16; nc <= 24; nc += 8) {
+      for (size_t kc = 1; kc <= 80; kc += 17) {
+        for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {  // ks(3) with k in {1, 18, 35, 52, 69}, m = 1, n = 8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc <= 80; kc += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(kc)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {  // ks(3) over k in {1, 18, 35, 52, 69}, m = 1, n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc <= 80; kc += 17) {
+      for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+        for (uint32_t nc = 1; nc <= 8; ++nc) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {  // ks(3) with n in [9, 15], k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed 8 would only repeat the n == nr case
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {  // ks(3) with n in {16, 24}, k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed 8 would only repeat the n == nr case
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {  // cm_stride(11) over k in {1, 18, 35, 52, 69}, m = 1, n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc <= 80; kc += 17) {
+      for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+        for (uint32_t nc = 1; nc <= 8; ++nc) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {  // ks(3) with a_offset(83), k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc <= 80; kc += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(kc)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {  // ks(3), a_offset(83) and zero_index, k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t zi = 0; zi < 1; ++zi) {  // only index 0 is visited
+      for (size_t kc = 1; kc <= 80; kc += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(kc)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(zi)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {  // one run: m == mr, n == nr, k = 16, qmin(128)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmin(128)  // NOTE(review): presumably the lower output clamp - confirm in GemmMicrokernelTester
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {  // one run: m == mr, n == nr, k = 16, qmax(128)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)  // NOTE(review): presumably the upper output clamp - confirm in GemmMicrokernelTester
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {  // one run: m == mr, n == nr, k = 16, cm_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)  // 11 is larger than n (8)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {  // one run: m == mr, n == nr, k = 16
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {  // one run: m == mr, n == nr, k = 16, cn_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)  // 11 is larger than nr (8)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {  // k = 16 for m = 1 and each n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+      for (uint32_t nc = 1; nc <= 8; ++nc) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(mc)
+          .n(nc)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {  // k = 16, n = 8, m swept over [1, 1]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(mc)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {  // k = 16, m = 1, n swept over [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t nc = 1; nc <= 8; ++nc) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(nc)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {  // every k in [1, 15] with m = 1, n = 8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc < 16; ++kc) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(kc)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {  // k in [1, 15] crossed with m = 1 and n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc < 16; ++kc) {
+      for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+        for (uint32_t nc = 1; nc <= 8; ++nc) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {  // every k in [17, 31] with m = 1, n = 8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 17; kc < 32; ++kc) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(kc)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {  // k in [17, 31] crossed with m = 1 and n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 17; kc < 32; ++kc) {
+      for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+        for (uint32_t nc = 1; nc <= 8; ++nc) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {  // multiples of 16 from 32 through 160, m = 1, n = 8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 32; kc <= 160; kc += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(kc)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {  // k = 32, 48, ..., 160 crossed with m = 1 and n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 32; kc <= 160; kc += 16) {
+      for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+        for (uint32_t nc = 1; nc <= 8; ++nc) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {  // n in [9, 15] crossed with k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed 8 would only repeat the n == nr case
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {  // n in [9, 15], k in {1, 18, 35, 52, 69}, cn_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed 8 would only repeat the n == nr case
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {  // n in [9, 15], k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t nc = 9; nc < 16; ++nc) {
+      for (size_t kc = 1; kc <= 80; kc += 17) {
+        for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {  // n in {16, 24} crossed with k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed 8 would only repeat the n == nr case
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {  // n in {16, 24}, k in {1, 18, 35, 52, 69}, cn_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t nc = 16; nc <= 24; nc += 8) {
+      for (size_t kc = 1; kc <= 80; kc += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(nc)
+          .k(kc)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {  // n in {16, 24}, k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t nc = 16; nc <= 24; nc += 8) {
+      for (size_t kc = 1; kc <= 80; kc += 17) {
+        for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {  // ks(3) with k in {1, 18, 35, 52, 69}, m = 1, n = 8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc <= 80; kc += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(kc)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {  // ks(3) over k in {1, 18, 35, 52, 69}, m = 1, n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc <= 80; kc += 17) {
+      for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+        for (uint32_t nc = 1; nc <= 8; ++nc) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {  // ks(3) with n in [9, 15], k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed 8 would only repeat the n == nr case
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {  // ks(3) with n in {16, 24}, k in {1, 18, 35, 52, 69}, m = 1
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed 8 would only repeat the n == nr case
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {  // cm_stride(11) over k in {1, 18, 35, 52, 69}, m = 1, n in [1, 8]
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc <= 80; kc += 17) {
+      for (uint32_t mc = 1; mc <= 1; ++mc) {  // single pass: upper bound equals mr
+        for (uint32_t nc = 1; nc <= 8; ++nc) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(mc)
+            .n(nc)
+            .k(kc)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {  // ks(3) with a_offset(83), k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t kc = 1; kc <= 80; kc += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(kc)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {  // ks(3), a_offset(83) and zero_index, k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t zi = 0; zi < 1; ++zi) {  // only index 0 is visited
+      for (size_t kc = 1; kc <= 80; kc += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(kc)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(zi)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {  // Full 1x8 tile, k = 16, with qmin set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmin(128)  // NOTE(review): presumably the lower output clamp - confirm how the tester maps 128 onto the int8 range.
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {  // Full 1x8 tile, k = 16, with qmax set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)  // NOTE(review): presumably the upper output clamp - confirm how the tester maps 128 onto the int8 range.
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {  // Full 1x8 tile, k = 16, with cm_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)  // 11 is larger than nr (8); presumably a padded output row stride - confirm in GemmMicrokernelTester.
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {  // Single configuration: full 1x8 tile with k = 16.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)  // m equals mr.
+      .n(8)  // n equals nr.
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {  // Full 1x8 tile, k = 16, with cn_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)  // 11 is larger than nr (8); presumably the stride between output column tiles - confirm in GemmMicrokernelTester.
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {  // Every (m, n) sub-tile of the 1x8 tile at k = 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {  // Single iteration: m == 1, equal to mr.
+      for (uint32_t n = 1; n <= 8; n++) {  // Every column count from 1 up to nr (8).
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {  // Varies m only, with n = 8 and k = 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {  // Single iteration: m == 1, equal to mr.
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {  // Varies n only, with m = 1 and k = 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {  // Every column count from 1 up to nr (8).
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {  // Full 1x8 tile for every k below 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15.
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {  // Every sub-tile for every k below 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15.
+      for (uint32_t m = 1; m <= 1; m++) {  // Single iteration: m == 1, equal to mr.
+        for (uint32_t n = 1; n <= 8; n++) {  // Every column count from 1 up to nr (8).
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {  // Full 1x8 tile for k strictly between 16 and 32.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31.
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {  // Every sub-tile for k strictly between 16 and 32.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31.
+      for (uint32_t m = 1; m <= 1; m++) {  // Single iteration: m == 1, equal to mr.
+        for (uint32_t n = 1; n <= 8; n++) {  // Every column count from 1 up to nr (8).
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {  // Full 1x8 tile for k values that are multiples of 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160.
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {  // Every sub-tile for k values that are multiples of 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160.
+      for (uint32_t m = 1; m <= 1; m++) {  // Single iteration: m == 1, equal to mr.
+        for (uint32_t n = 1; n <= 8; n++) {  // Every column count from 1 up to nr (8).
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {  // Column counts above nr (8) across a stepped k sweep.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15.
+      for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // Use the loop value: a fixed n(8) only repeated the full-tile case and never exercised n > 8.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {  // Column counts above nr (8) with cn_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15.
+      for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // Use the loop value: a fixed n(8) only repeated the full-tile case and never exercised n > 8.
+          .k(k)
+          .cn_stride(11)  // Same stride the n_div_8_strided_cn test already combines with n(n).
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {  // Column counts above nr (8) for every m, stepped k sweep.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15.
+      for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+        for (uint32_t m = 1; m <= 1; m++) {  // Single iteration: m == 1, equal to mr.
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {  // Column counts that are larger multiples of nr (8).
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16 and 24.
+      for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // Use the loop value: a fixed n(8) only repeated the single-tile case and never exercised 16 or 24 columns.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {  // Larger multiples of nr (8) with cn_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16 and 24.
+      for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)  // 11 is larger than nr (8); presumably the stride between output column tiles - confirm in GemmMicrokernelTester.
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {  // Larger multiples of nr (8) for every m, stepped k sweep.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16 and 24.
+      for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+        for (uint32_t m = 1; m <= 1; m++) {  // Single iteration: m == 1, equal to mr.
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {  // Full 1x8 tile with ks set to 3, stepped k sweep.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)  // NOTE(review): presumably the kernel-size (indirection) dimension of IGEMM - confirm in GemmMicrokernelTester.
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {  // Every sub-tile with ks set to 3, stepped k sweep.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+      for (uint32_t m = 1; m <= 1; m++) {  // Single iteration: m == 1, equal to mr.
+        for (uint32_t n = 1; n <= 8; n++) {  // Every column count from 1 up to nr (8).
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {  // Column counts above nr (8) with ks set to 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15.
+      for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // Use the loop value: a fixed n(8) only repeated the full-tile case and never exercised n > 8.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {  // Larger multiples of nr (8) with ks set to 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16 and 24.
+      for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)  // Use the loop value: a fixed n(8) only repeated the single-tile case and never exercised 16 or 24 columns.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {  // Sub-tile sweep with cm_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+      for (uint32_t m = 1; m <= 1; m++) {  // Single iteration: m == 1, equal to mr.
+        for (uint32_t n = 1; n <= 8; n++) {  // Every column count from 1 up to nr (8).
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)  // 11 is larger than nr (8); presumably a padded output row stride - confirm in GemmMicrokernelTester.
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {  // Full 1x8 tile with ks = 3 and a_offset = 83.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(83)  // NOTE(review): meaning of a_offset is defined by GemmMicrokernelTester, which is not visible here.
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {  // Same setup as a_offset, plus zero_index(mz).
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {  // Single iteration: mz == 0 (the bound 1 equals mr).
+      for (size_t k = 1; k <= 80; k += 17) {  // k takes the values 1, 18, 35, 52, 69.
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)  // NOTE(review): presumably selects which row uses the zero buffer - confirm in GemmMicrokernelTester.
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {  // Full 1x8 tile, k = 16, with qmin set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmin(128)  // NOTE(review): presumably the lower output clamp - confirm how the tester maps 128 onto the int8 range.
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {  // Full 1x8 tile, k = 16, with qmax set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)  // NOTE(review): presumably the upper output clamp - confirm how the tester maps 128 onto the int8 range.
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {  // Full 1x8 tile, k = 16, with cm_stride set to 11.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)  // 11 is larger than nr (8); presumably a padded output row stride - confirm in GemmMicrokernelTester.
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -2362,4218 +6574,6 @@
 #endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C8__AARCH64_NEON_MLAL_PADAL_PRFM, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_2X8C16__AARCH64_NEON_MLAL_PADAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X8__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
diff --git a/test/qs8-igemm-minmax-gemmlowp.yaml b/test/qs8-igemm-minmax-gemmlowp.yaml
index 5cc6a7d..d516a1d 100644
--- a/test/qs8-igemm-minmax-gemmlowp.yaml
+++ b/test/qs8-igemm-minmax-gemmlowp.yaml
@@ -3,19 +3,19 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
 - name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal
@@ -30,19 +30,19 @@
 - name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_cortex_a53
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_padal_prfm
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
 - name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane