NEON dot product implementations of QS8 GEMM and IGEMM c4 assembly microkernels with FP32 requantization

PiperOrigin-RevId: 382208248
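
These kernels add an FP32 requantization branch alongside the existing GEMMLOWP one. As a reading aid (not code from this change), the FP32 path in the epilogues below — SCVTF, FMUL by the scale, FCVTNS, then the shared narrow/bias/clamp tail — corresponds to the scalar C sketch here; the helper name is hypothetical, and lrintf is assumed to match FCVTNS under the default round-to-nearest-even mode.

    #include <math.h>
    #include <stdint.h>

    // Scalar sketch of the FP32 requantization tail (illustration only):
    //   SCVTF     - convert the int32 accumulator to float
    //   FMUL      - multiply by the combined requantization scale
    //   FCVTNS    - round to nearest, ties to even
    //   SQADD     - add the output zero point (done at 16 bits in the kernel)
    //   SMAX/SMIN - clamp to the quantized output range
    static inline int8_t fp32_requantize(int32_t acc, float scale,
                                         int16_t output_zero_point,
                                         int8_t output_min, int8_t output_max) {
      const float scaled = (float) acc * scale;          // SCVTF + FMUL
      int32_t out = (int32_t) lrintf(scaled);            // FCVTNS
      out += (int32_t) output_zero_point;                // SQADD (saturation elided)
      if (out < (int32_t) output_min) out = output_min;  // SMAX
      if (out > (int32_t) output_max) out = output_max;  // SMIN
      return (int8_t) out;
    }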
diff --git a/BUILD.bazel b/BUILD.bazel
index 21c8af5..58f1281 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4279,6 +4279,8 @@
     "src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
     "src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S",
     "src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S",
+    "src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S",
+    "src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S",
     "src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S",
     "src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S",
     "src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S",
@@ -4289,11 +4291,14 @@
     "src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S",
     "src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S",
     "src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
+    "src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S",
+    "src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S",
+    "src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
     "src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S",
     "src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S",
     "src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S",
     "src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S",
-    "src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
     "src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S",
     "src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
     "src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S",
@@ -4305,6 +4310,9 @@
     "src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S",
     "src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S",
     "src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
+    "src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S",
+    "src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
     "src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S",
     "src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S",
     "src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e5d7517..220175e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3503,6 +3503,8 @@
   src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
   src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
   src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+  src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S
+  src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S
   src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
   src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
   src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
@@ -3513,11 +3515,14 @@
   src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
   src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
   src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
+  src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+  src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S
+  src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
+  src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
   src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
   src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
   src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
   src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
-  src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
   src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
   src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
   src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
@@ -3529,6 +3534,9 @@
   src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
   src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
   src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
+  src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+  src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
+  src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
   src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
   src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
   src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S)
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index eed993a..33fec41 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -197,15 +197,22 @@
 tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=0 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
 tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=1 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
 
-tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in       -o src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
-tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in       -o src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
 tools/xngen src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in    -o src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
 tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in     -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
-tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
-tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in       -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
-tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in       -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
 
+### C4 micro-kernels
+tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in       -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
+tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in       -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
 tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in      -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
+
+tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in       -D REQUANTIZATION=FP32     -o src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S
+tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=FP32     -o src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=FP32     -o src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in       -D REQUANTIZATION=FP32     -o src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=FP32     -o src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
 tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in      -D REQUANTIZATION=FP32     -o src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
 
 ################################### x86 SSE ###################################
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index 4387185..40df3ec 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -195,9 +195,15 @@
 tools/xngen src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=1 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
 
 tools/xngen src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in    -o src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
-tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
-tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in       -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
-tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in      -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
+
+### C4 micro-kernels
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=GEMMLOWP -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in      -D REQUANTIZATION=GEMMLOWP -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
+
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=FP32     -o src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=FP32     -o src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in      -D REQUANTIZATION=FP32     -o src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
 
 ################################### x86 SSE ###################################
 ### C2 micro-kernels
diff --git a/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in b/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
index 3d75e39..3043049 100644
--- a/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
+++ b/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
@@ -5,7 +5,10 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld32(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -15,7 +18,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          (x7)
 #     size_t cn_stride,          [sp] -> x12
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union ${CONV_PARAMS} params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -25,7 +28,7 @@
 # C0  x6 v28 v29 v30 v31
 # unused v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld32
 0:
         # Load initial bias from w into accumulators
         ADD     x2, x2, 3               // kc = (kc + 3) & ~3
@@ -50,30 +53,54 @@
         SDOT    v31.4s, v19.16b, v0.4b[0]
         B.HI    1b
 
-        # Apply params - scale, shift, bias and clamp
-        LD2R    {v0.4s, v1.4s}, [x11], 8
-        CMEQ    v2.4s, v1.4s, 0
-        SQRDMULH v4.4s, v28.4s, v0.4s
-        SQRDMULH v5.4s, v29.4s, v0.4s
-        SQRDMULH v6.4s, v30.4s, v0.4s
-        SQRDMULH v7.4s, v31.4s, v0.4s
-        BIC     v28.16b, v28.16b, v2.16b
-        BIC     v29.16b, v29.16b, v2.16b
-        BIC     v30.16b, v30.16b, v2.16b
-        BIC     v31.16b, v31.16b, v2.16b
-        SSRA    v4.4s, v28.4s, 31       // signed shift right accumulate
-        SSRA    v5.4s, v29.4s, 31
-        SSRA    v6.4s, v30.4s, 31
-        SSRA    v7.4s, v31.4s, 31
-        SRSHL   v4.4s, v4.4s, v1.4s     // signed rounding shift left
-        SRSHL   v5.4s, v5.4s, v1.4s
-        SRSHL   v6.4s, v6.4s, v1.4s
-        SRSHL   v7.4s, v7.4s, v1.4s
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          LD2R    {v0.4s, v1.4s}, [x11], 8
+          CMEQ    v2.4s, v1.4s, 0
+
+          BIC     v4.16b, v28.16b, v2.16b
+          BIC     v5.16b, v29.16b, v2.16b
+          BIC     v6.16b, v30.16b, v2.16b
+          BIC     v7.16b, v31.16b, v2.16b
+
+          SQRDMULH v28.4s, v28.4s, v0.4s
+          SQRDMULH v29.4s, v29.4s, v0.4s
+          SQRDMULH v30.4s, v30.4s, v0.4s
+          SQRDMULH v31.4s, v31.4s, v0.4s
+
+          SSRA    v28.4s, v4.4s, 31
+          SSRA    v29.4s, v5.4s, 31
+          SSRA    v30.4s, v6.4s, 31
+          SSRA    v31.4s, v7.4s, 31
+
+          SRSHL   v28.4s, v28.4s, v1.4s   // signed rounding shift left
+          SRSHL   v29.4s, v29.4s, v1.4s
+          SRSHL   v30.4s, v30.4s, v1.4s
+          SRSHL   v31.4s, v31.4s, v1.4s
+        $elif REQUANTIZATION == "FP32":
+          # Apply params - scale, bias and clamp
+          LD1R    {v0.4s}, [x11], 4
+
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          FMUL    v28.4s, v0.4s, v28.4s
+          FMUL    v29.4s, v0.4s, v29.4s
+          FMUL    v30.4s, v0.4s, v30.4s
+          FMUL    v31.4s, v0.4s, v31.4s
+
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
+
         LD1R    {v2.8h}, [x11], 2       // add bias
-        SQXTN   v4.4h, v4.4s
-        SQXTN   v6.4h, v6.4s
-        SQXTN2  v4.8h, v5.4s
-        SQXTN2  v6.8h, v7.4s
+        SQXTN   v4.4h, v28.4s
+        SQXTN   v6.4h, v30.4s
+        SQXTN2  v4.8h, v29.4s
+        SQXTN2  v6.8h, v31.4s
         LD2R    {v0.16b, v1.16b}, [x11]   // clamp to min/max
         SQADD   v4.8h, v4.8h, v2.8h
         SQADD   v6.8h, v6.8h, v2.8h
@@ -111,7 +138,7 @@
 6:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32
+END_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld32
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
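
For contrast with the FP32 branch introduced above, the GEMMLOWP branch (SQRDMULH, the CMEQ/BIC/SSRA fixup, SRSHL) can be modeled by the scalar sketch below. This is an illustration under stated assumptions, not library code: right_shift is taken as a non-negative count (the kernel stores its negation so SRSHL shifts right), and the SQRDMULH saturation corner at INT32_MIN * INT32_MIN is ignored.

    #include <stdint.h>

    // Scalar sketch of the GEMMLOWP requantization branch (illustration only):
    static inline int32_t gemmlowp_requantize(int32_t acc, int32_t multiplier,
                                              int32_t right_shift /* >= 0 */) {
      // SQRDMULH: rounding doubling multiply returning the high 32 bits.
      const int64_t product = (int64_t) acc * (int64_t) multiplier;
      int32_t q = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
      if (right_shift != 0) {
        // CMEQ/BIC/SSRA: subtract 1 when the original accumulator is negative,
        // reproducing gemmlowp's rounding before the shift.
        q += acc >> 31;
        // SRSHL by a negative amount: rounding arithmetic shift right.
        q = (q + (INT32_C(1) << (right_shift - 1))) >> right_shift;
      }
      return q;
    }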
diff --git a/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in b/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
index a7973d6..a9a72b5 100644
--- a/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
+++ b/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
@@ -5,7 +5,10 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld64(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -15,7 +18,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          (x7)
 #     size_t cn_stride,          [sp] -> x12
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union ${CONV_PARAMS} params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -25,7 +28,7 @@
 # C0  x6 v28 v29 v30 v31
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld64
         ADD     x2, x2, 3               // kc = (kc + 3) & ~3
         BIC     x2, x2, 3
 
@@ -68,30 +71,56 @@
         TBNZ    x0, 2, 3f
 
 2:
-         # Apply params - scale, shift, bias and clamp
-        LD2R    {v0.4s, v1.4s}, [x11], 8
-        SQRDMULH v4.4s, v28.4s, v0.4s
-        SQRDMULH v5.4s, v29.4s, v0.4s
-        CMEQ    v2.4s, v1.4s, 0
-        SQRDMULH v6.4s, v30.4s, v0.4s
-        SQRDMULH v7.4s, v31.4s, v0.4s
-        BIC     v28.16b, v28.16b, v2.16b
-        BIC     v29.16b, v29.16b, v2.16b
-        BIC     v30.16b, v30.16b, v2.16b
-        BIC     v31.16b, v31.16b, v2.16b
-        SSRA    v4.4s, v28.4s, 31       // signed shift right accumulate
-        SSRA    v5.4s, v29.4s, 31
-        SSRA    v6.4s, v30.4s, 31
-        SSRA    v7.4s, v31.4s, 31
-        SRSHL   v4.4s, v4.4s, v1.4s     // signed rounding shift left
-        SRSHL   v5.4s, v5.4s, v1.4s
-        SRSHL   v6.4s, v6.4s, v1.4s
-        SRSHL   v7.4s, v7.4s, v1.4s
+
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          LD2R    {v0.4s, v1.4s}, [x11], 8
+          CMEQ    v2.4s, v1.4s, 0
+
+          BIC     v4.16b, v28.16b, v2.16b
+          BIC     v5.16b, v29.16b, v2.16b
+          BIC     v6.16b, v30.16b, v2.16b
+          BIC     v7.16b, v31.16b, v2.16b
+
+          SQRDMULH v28.4s, v28.4s, v0.4s
+          SQRDMULH v29.4s, v29.4s, v0.4s
+          SQRDMULH v30.4s, v30.4s, v0.4s
+          SQRDMULH v31.4s, v31.4s, v0.4s
+
+          SSRA    v28.4s, v4.4s, 31
+          SSRA    v29.4s, v5.4s, 31
+          SSRA    v30.4s, v6.4s, 31
+          SSRA    v31.4s, v7.4s, 31
+
+          SRSHL   v28.4s, v28.4s, v1.4s   // signed rounding shift left
+          SRSHL   v29.4s, v29.4s, v1.4s
+          SRSHL   v30.4s, v30.4s, v1.4s
+          SRSHL   v31.4s, v31.4s, v1.4s
+        $elif REQUANTIZATION == "FP32":
+          # Apply params - scale, bias and clamp
+          LD1R    {v0.4s}, [x11], 4
+
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          FMUL    v28.4s, v0.4s, v28.4s
+          FMUL    v29.4s, v0.4s, v29.4s
+          FMUL    v30.4s, v0.4s, v30.4s
+          FMUL    v31.4s, v0.4s, v31.4s
+
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
+
         LD1R    {v2.8h}, [x11], 2       // add bias
-        SQXTN   v4.4h, v4.4s
-        SQXTN   v6.4h, v6.4s
-        SQXTN2  v4.8h, v5.4s
-        SQXTN2  v6.8h, v7.4s
+        SQXTN   v4.4h, v28.4s
+        SQXTN   v6.4h, v30.4s
+        SQXTN2  v4.8h, v29.4s
+        SQXTN2  v6.8h, v31.4s
+
         LD2R    {v0.16b, v1.16b}, [x11]   // clamp to min/max
         SQADD   v4.8h, v4.8h, v2.8h
         SQADD   v6.8h, v6.8h, v2.8h
@@ -145,7 +174,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64
+END_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld64
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in b/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
index d2eae7e..21d6f46 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
@@ -5,7 +5,10 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -15,7 +18,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x12
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union ${CONV_PARAMS} params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -31,7 +34,7 @@
 # C3  x7 v19 v23 v27 v31
 # unused v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -65,7 +68,7 @@
         LDP     q24, q28, [x5], 32
         MOV     v19.16b, v16.16b
         MOV     v21.16b, v20.16b
-        LDR     x11, [sp, 40]           // params
+        LDR     x11, [sp, 40]           // reload params
         MOV     v22.16b, v20.16b
         MOV     v23.16b, v20.16b
         MOV     v25.16b, v24.16b
@@ -403,131 +406,191 @@
 
         .p2align 3
 3:
-        # Apply params - scale, shift, bias and clamp
-        LD1R    {v0.4s}, [x11], 4
-        SQRDMULH v4.4s, v16.4s, v0.4s
-        SQRDMULH v5.4s, v17.4s, v0.4s
-        LD1R    {v1.4s}, [x11], 4
-        SQRDMULH v6.4s, v18.4s, v0.4s
-        SQRDMULH v7.4s, v19.4s, v0.4s
-        SQRDMULH v8.4s, v20.4s, v0.4s
-        SQRDMULH v9.4s, v21.4s, v0.4s
-        CMEQ    v2.4s, v1.4s, 0
-        SQRDMULH v10.4s, v22.4s, v0.4s
-        SQRDMULH v11.4s, v23.4s, v0.4s
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          LD2R    {v0.4s, v1.4s}, [x11], 8
+          CMEQ    v2.4s, v1.4s, 0
 
-        BIC     v16.16b, v16.16b, v2.16b
-        BIC     v17.16b, v17.16b, v2.16b
-        BIC     v18.16b, v18.16b, v2.16b
-        BIC     v19.16b, v19.16b, v2.16b
-        BIC     v20.16b, v20.16b, v2.16b
-        BIC     v21.16b, v21.16b, v2.16b
-        BIC     v22.16b, v22.16b, v2.16b
-        BIC     v23.16b, v23.16b, v2.16b
+          BIC     v4.16b, v16.16b, v2.16b
+          BIC     v5.16b, v17.16b, v2.16b
+          BIC     v6.16b, v18.16b, v2.16b
+          BIC     v7.16b, v19.16b, v2.16b
 
-        SSRA    v4.4s, v16.4s, 31       // signed shift right accumulate
-        SSRA    v5.4s, v17.4s, 31
-        SSRA    v6.4s, v18.4s, 31
-        SSRA    v7.4s, v19.4s, 31
-        SSRA    v8.4s, v20.4s, 31
-        SSRA    v9.4s, v21.4s, 31
-        SSRA    v10.4s, v22.4s, 31
-        SSRA    v11.4s, v23.4s, 31
+          SQRDMULH v16.4s, v16.4s, v0.4s
+          SQRDMULH v17.4s, v17.4s, v0.4s
+          SQRDMULH v18.4s, v18.4s, v0.4s
+          SQRDMULH v19.4s, v19.4s, v0.4s
 
-        SQRDMULH v16.4s, v24.4s, v0.4s
-        SQRDMULH v17.4s, v25.4s, v0.4s
-        SQRDMULH v18.4s, v26.4s, v0.4s
-        SQRDMULH v19.4s, v27.4s, v0.4s
-        SQRDMULH v20.4s, v28.4s, v0.4s
-        SQRDMULH v21.4s, v29.4s, v0.4s
-        SQRDMULH v22.4s, v30.4s, v0.4s
-        SQRDMULH v23.4s, v31.4s, v0.4s
+          SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
+          SSRA    v17.4s, v5.4s, 31
+          SSRA    v18.4s, v6.4s, 31
+          SSRA    v19.4s, v7.4s, 31
 
-        BIC     v24.16b, v24.16b, v2.16b
-        BIC     v25.16b, v25.16b, v2.16b
-        BIC     v26.16b, v26.16b, v2.16b
-        BIC     v27.16b, v27.16b, v2.16b
-        BIC     v28.16b, v28.16b, v2.16b
-        BIC     v29.16b, v29.16b, v2.16b
-        BIC     v30.16b, v30.16b, v2.16b
-        BIC     v31.16b, v31.16b, v2.16b
+          BIC     v4.16b, v20.16b, v2.16b
+          BIC     v5.16b, v21.16b, v2.16b
+          BIC     v6.16b, v22.16b, v2.16b
+          BIC     v7.16b, v23.16b, v2.16b
 
-        SSRA    v16.4s, v24.4s, 31
-        SSRA    v17.4s, v25.4s, 31
-        SSRA    v18.4s, v26.4s, 31
-        SSRA    v19.4s, v27.4s, 31
-        SSRA    v20.4s, v28.4s, 31
-        SSRA    v21.4s, v29.4s, 31
-        SSRA    v22.4s, v30.4s, 31
-        SSRA    v23.4s, v31.4s, 31
+          SQRDMULH v20.4s, v20.4s, v0.4s
+          SQRDMULH v21.4s, v21.4s, v0.4s
+          SQRDMULH v22.4s, v22.4s, v0.4s
+          SQRDMULH v23.4s, v23.4s, v0.4s
 
-        SRSHL   v4.4s,  v4.4s, v1.4s    // signed rounding shift left
-        SRSHL   v5.4s,  v5.4s, v1.4s
-        SRSHL   v6.4s,  v6.4s, v1.4s
-        SRSHL   v7.4s,  v7.4s, v1.4s
-        SRSHL   v8.4s,  v8.4s, v1.4s
-        SRSHL   v9.4s,  v9.4s, v1.4s
-        SRSHL   v10.4s, v10.4s, v1.4s
-        SRSHL   v11.4s, v11.4s, v1.4s
+          SSRA    v20.4s, v4.4s, 31
+          SSRA    v21.4s, v5.4s, 31
+          SSRA    v22.4s, v6.4s, 31
+          SSRA    v23.4s, v7.4s, 31
 
-        SRSHL   v16.4s, v16.4s, v1.4s
-        SRSHL   v17.4s, v17.4s, v1.4s
-        SRSHL   v18.4s, v18.4s, v1.4s
-        SRSHL   v19.4s, v19.4s, v1.4s
-        SRSHL   v20.4s, v20.4s, v1.4s
-        SRSHL   v21.4s, v21.4s, v1.4s
-        SRSHL   v22.4s, v22.4s, v1.4s
-        SRSHL   v23.4s, v23.4s, v1.4s
+          BIC     v4.16b, v24.16b, v2.16b
+          BIC     v5.16b, v25.16b, v2.16b
+          BIC     v6.16b, v26.16b, v2.16b
+          BIC     v7.16b, v27.16b, v2.16b
 
-        SQXTN   v4.4h,  v4.4s
-        SQXTN   v5.4h,  v5.4s
-        SQXTN   v6.4h,  v6.4s
-        SQXTN   v7.4h,  v7.4s
+          SQRDMULH v24.4s, v24.4s, v0.4s
+          SQRDMULH v25.4s, v25.4s, v0.4s
+          SQRDMULH v26.4s, v26.4s, v0.4s
+          SQRDMULH v27.4s, v27.4s, v0.4s
+
+          SSRA    v24.4s, v4.4s, 31
+          SSRA    v25.4s, v5.4s, 31
+          SSRA    v26.4s, v6.4s, 31
+          SSRA    v27.4s, v7.4s, 31
+
+          BIC     v4.16b, v28.16b, v2.16b
+          BIC     v5.16b, v29.16b, v2.16b
+          BIC     v6.16b, v30.16b, v2.16b
+          BIC     v7.16b, v31.16b, v2.16b
+
+          SQRDMULH v28.4s, v28.4s, v0.4s
+          SQRDMULH v29.4s, v29.4s, v0.4s
+          SQRDMULH v30.4s, v30.4s, v0.4s
+          SQRDMULH v31.4s, v31.4s, v0.4s
+
+          SSRA    v28.4s, v4.4s, 31
+          SSRA    v29.4s, v5.4s, 31
+          SSRA    v30.4s, v6.4s, 31
+          SSRA    v31.4s, v7.4s, 31
+
+          SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
+          SRSHL   v17.4s, v17.4s, v1.4s
+          SRSHL   v18.4s, v18.4s, v1.4s
+          SRSHL   v19.4s, v19.4s, v1.4s
+          SRSHL   v20.4s, v20.4s, v1.4s
+          SRSHL   v21.4s, v21.4s, v1.4s
+          SRSHL   v22.4s, v22.4s, v1.4s
+          SRSHL   v23.4s, v23.4s, v1.4s
+          SRSHL   v24.4s, v24.4s, v1.4s
+          SRSHL   v25.4s, v25.4s, v1.4s
+          SRSHL   v26.4s, v26.4s, v1.4s
+          SRSHL   v27.4s, v27.4s, v1.4s
+          SRSHL   v28.4s, v28.4s, v1.4s
+          SRSHL   v29.4s, v29.4s, v1.4s
+          SRSHL   v30.4s, v30.4s, v1.4s
+          SRSHL   v31.4s, v31.4s, v1.4s
+        $elif REQUANTIZATION == "FP32":
+          # Apply params - scale, bias and clamp
+          LD1R    {v0.4s}, [x11], 4
+
+          SCVTF   v16.4s, v16.4s
+          SCVTF   v17.4s, v17.4s
+          SCVTF   v18.4s, v18.4s
+          SCVTF   v19.4s, v19.4s
+          SCVTF   v20.4s, v20.4s
+          SCVTF   v21.4s, v21.4s
+          SCVTF   v22.4s, v22.4s
+          SCVTF   v23.4s, v23.4s
+          SCVTF   v24.4s, v24.4s
+          SCVTF   v25.4s, v25.4s
+          SCVTF   v26.4s, v26.4s
+          SCVTF   v27.4s, v27.4s
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          FMUL    v16.4s, v0.4s, v16.4s
+          FMUL    v17.4s, v0.4s, v17.4s
+          FMUL    v18.4s, v0.4s, v18.4s
+          FMUL    v19.4s, v0.4s, v19.4s
+          FMUL    v20.4s, v0.4s, v20.4s
+          FMUL    v21.4s, v0.4s, v21.4s
+          FMUL    v22.4s, v0.4s, v22.4s
+          FMUL    v23.4s, v0.4s, v23.4s
+          FMUL    v24.4s, v0.4s, v24.4s
+          FMUL    v25.4s, v0.4s, v25.4s
+          FMUL    v26.4s, v0.4s, v26.4s
+          FMUL    v27.4s, v0.4s, v27.4s
+          FMUL    v28.4s, v0.4s, v28.4s
+          FMUL    v29.4s, v0.4s, v29.4s
+          FMUL    v30.4s, v0.4s, v30.4s
+          FMUL    v31.4s, v0.4s, v31.4s
+
+          FCVTNS  v16.4s, v16.4s
+          FCVTNS  v17.4s, v17.4s
+          FCVTNS  v18.4s, v18.4s
+          FCVTNS  v19.4s, v19.4s
+          FCVTNS  v20.4s, v20.4s
+          FCVTNS  v21.4s, v21.4s
+          FCVTNS  v22.4s, v22.4s
+          FCVTNS  v23.4s, v23.4s
+          FCVTNS  v24.4s, v24.4s
+          FCVTNS  v25.4s, v25.4s
+          FCVTNS  v26.4s, v26.4s
+          FCVTNS  v27.4s, v27.4s
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
+
         SQXTN   v16.4h, v16.4s
         SQXTN   v17.4h, v17.4s
         SQXTN   v18.4h, v18.4s
         SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
         LD1R    {v2.8h}, [x11], 2       // add bias
 
-        SQXTN2  v4.8h,  v8.4s
-        SQXTN2  v5.8h,  v9.4s
-        SQXTN2  v6.8h, v10.4s
-        SQXTN2  v7.8h, v11.4s
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
         SQXTN2  v18.8h, v22.4s
         SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
 
-        SQADD   v4.8h,  v4.8h, v2.8h
-        SQADD   v5.8h,  v5.8h, v2.8h
-        SQADD   v6.8h,  v6.8h, v2.8h
-        SQADD   v7.8h,  v7.8h, v2.8h
         SQADD   v16.8h, v16.8h, v2.8h
         SQADD   v17.8h, v17.8h, v2.8h
         SQADD   v18.8h, v18.8h, v2.8h
         SQADD   v19.8h, v19.8h, v2.8h
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
         LD1R    {v0.16b}, [x11], 1      // clamp min value
 
-        SQXTN   v4.8b,  v4.8h
-        SQXTN   v5.8b,  v5.8h
-        SQXTN   v6.8b,  v6.8h
-        SQXTN   v7.8b,  v7.8h
+        SQXTN   v4.8b, v16.8h
+        SQXTN   v5.8b, v17.8h
+        SQXTN   v6.8b, v18.8h
+        SQXTN   v7.8b, v19.8h
         LD1R    {v1.16b}, [x11]         // clamp max value
-        SQXTN2  v4.16b, v16.8h
-        SQXTN2  v5.16b, v17.8h
-        SQXTN2  v6.16b, v18.8h
-        SQXTN2  v7.16b, v19.8h
+        SQXTN2  v4.16b, v24.8h
+        SQXTN2  v5.16b, v25.8h
+        SQXTN2  v6.16b, v26.8h
+        SQXTN2  v7.16b, v27.8h
         LDR     x12, [sp, 32]           // cn_stride
 
-        SMAX    v4.16b,  v4.16b, v0.16b
-        SMAX    v5.16b,  v5.16b, v0.16b
-        SMAX    v6.16b,  v6.16b, v0.16b
-        SMAX    v7.16b,  v7.16b, v0.16b
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
         SUBS    x1, x1, 16
-        SMIN    v4.16b,  v4.16b, v1.16b
-        SMIN    v5.16b,  v5.16b, v1.16b
-        SMIN    v6.16b,  v6.16b, v1.16b
-        SMIN    v7.16b,  v7.16b, v1.16b
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
         B.LO    6f
 
         # Store full 4 x 16
@@ -658,7 +721,7 @@
         LDP     d8,  d9, [sp], 32
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+END_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in b/src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
index 2ce8bfc..3cb9147 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
@@ -21,7 +21,7 @@
 #     const union ${CONV_PARAMS} params)  [sp + 8] -> x11
 
 $if REQUANTIZATION == "GEMMLOWP":
-  # params structure is 11 bytes
+  # params structure is 12 bytes
   #  struct {
   #    int32_t multiplier;
   #    int32_t right_shift;
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in b/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
index b482c79..b843647 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
@@ -5,7 +5,10 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld32(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -15,7 +18,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x12
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union ${CONV_PARAMS} params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -31,7 +34,7 @@
 # C3  x7 v19 v23 v27 v31
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld32
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -63,7 +66,7 @@
         LDP     q24, q28, [x5], 32
         MOV     v19.16b, v16.16b
         MOV     v21.16b, v20.16b
-        LDR     x11, [sp, 8]            // params
+        LDR     x11, [sp, 8]            // reload params
         MOV     v22.16b, v20.16b
         MOV     v23.16b, v20.16b
         MOV     x0, x2                  // k = kc.  assumes kc > 0
@@ -104,86 +107,141 @@
         SDOT    v31.4s, v7.16b, v3.4b[0]
         B.HI    1b
 
-        # Apply params - scale, shift, bias and clamp
-        LD2R    {v0.4s, v1.4s}, [x11], 8
-        CMEQ    v2.4s, v1.4s, 0
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          LD2R    {v0.4s, v1.4s}, [x11], 8
+          CMEQ    v2.4s, v1.4s, 0
 
-        BIC     v4.16b, v16.16b, v2.16b
-        BIC     v5.16b, v17.16b, v2.16b
-        BIC     v6.16b, v18.16b, v2.16b
-        BIC     v7.16b, v19.16b, v2.16b
+          BIC     v4.16b, v16.16b, v2.16b
+          BIC     v5.16b, v17.16b, v2.16b
+          BIC     v6.16b, v18.16b, v2.16b
+          BIC     v7.16b, v19.16b, v2.16b
 
-        SQRDMULH v16.4s, v16.4s, v0.4s
-        SQRDMULH v17.4s, v17.4s, v0.4s
-        SQRDMULH v18.4s, v18.4s, v0.4s
-        SQRDMULH v19.4s, v19.4s, v0.4s
+          SQRDMULH v16.4s, v16.4s, v0.4s
+          SQRDMULH v17.4s, v17.4s, v0.4s
+          SQRDMULH v18.4s, v18.4s, v0.4s
+          SQRDMULH v19.4s, v19.4s, v0.4s
 
-        SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
-        SSRA    v17.4s, v5.4s, 31
-        SSRA    v18.4s, v6.4s, 31
-        SSRA    v19.4s, v7.4s, 31
+          SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
+          SSRA    v17.4s, v5.4s, 31
+          SSRA    v18.4s, v6.4s, 31
+          SSRA    v19.4s, v7.4s, 31
 
-        BIC     v4.16b, v20.16b, v2.16b
-        BIC     v5.16b, v21.16b, v2.16b
-        BIC     v6.16b, v22.16b, v2.16b
-        BIC     v7.16b, v23.16b, v2.16b
+          BIC     v4.16b, v20.16b, v2.16b
+          BIC     v5.16b, v21.16b, v2.16b
+          BIC     v6.16b, v22.16b, v2.16b
+          BIC     v7.16b, v23.16b, v2.16b
 
-        SQRDMULH v20.4s, v20.4s, v0.4s
-        SQRDMULH v21.4s, v21.4s, v0.4s
-        SQRDMULH v22.4s, v22.4s, v0.4s
-        SQRDMULH v23.4s, v23.4s, v0.4s
+          SQRDMULH v20.4s, v20.4s, v0.4s
+          SQRDMULH v21.4s, v21.4s, v0.4s
+          SQRDMULH v22.4s, v22.4s, v0.4s
+          SQRDMULH v23.4s, v23.4s, v0.4s
 
-        SSRA    v20.4s, v4.4s, 31
-        SSRA    v21.4s, v5.4s, 31
-        SSRA    v22.4s, v6.4s, 31
-        SSRA    v23.4s, v7.4s, 31
+          SSRA    v20.4s, v4.4s, 31
+          SSRA    v21.4s, v5.4s, 31
+          SSRA    v22.4s, v6.4s, 31
+          SSRA    v23.4s, v7.4s, 31
 
-        BIC     v4.16b, v24.16b, v2.16b
-        BIC     v5.16b, v25.16b, v2.16b
-        BIC     v6.16b, v26.16b, v2.16b
-        BIC     v7.16b, v27.16b, v2.16b
+          BIC     v4.16b, v24.16b, v2.16b
+          BIC     v5.16b, v25.16b, v2.16b
+          BIC     v6.16b, v26.16b, v2.16b
+          BIC     v7.16b, v27.16b, v2.16b
 
-        SQRDMULH v24.4s, v24.4s, v0.4s
-        SQRDMULH v25.4s, v25.4s, v0.4s
-        SQRDMULH v26.4s, v26.4s, v0.4s
-        SQRDMULH v27.4s, v27.4s, v0.4s
+          SQRDMULH v24.4s, v24.4s, v0.4s
+          SQRDMULH v25.4s, v25.4s, v0.4s
+          SQRDMULH v26.4s, v26.4s, v0.4s
+          SQRDMULH v27.4s, v27.4s, v0.4s
 
-        SSRA    v24.4s, v4.4s, 31
-        SSRA    v25.4s, v5.4s, 31
-        SSRA    v26.4s, v6.4s, 31
-        SSRA    v27.4s, v7.4s, 31
+          SSRA    v24.4s, v4.4s, 31
+          SSRA    v25.4s, v5.4s, 31
+          SSRA    v26.4s, v6.4s, 31
+          SSRA    v27.4s, v7.4s, 31
 
-        BIC     v4.16b, v28.16b, v2.16b
-        BIC     v5.16b, v29.16b, v2.16b
-        BIC     v6.16b, v30.16b, v2.16b
-        BIC     v7.16b, v31.16b, v2.16b
+          BIC     v4.16b, v28.16b, v2.16b
+          BIC     v5.16b, v29.16b, v2.16b
+          BIC     v6.16b, v30.16b, v2.16b
+          BIC     v7.16b, v31.16b, v2.16b
 
-        SQRDMULH v28.4s, v28.4s, v0.4s
-        SQRDMULH v29.4s, v29.4s, v0.4s
-        SQRDMULH v30.4s, v30.4s, v0.4s
-        SQRDMULH v31.4s, v31.4s, v0.4s
+          SQRDMULH v28.4s, v28.4s, v0.4s
+          SQRDMULH v29.4s, v29.4s, v0.4s
+          SQRDMULH v30.4s, v30.4s, v0.4s
+          SQRDMULH v31.4s, v31.4s, v0.4s
 
-        SSRA    v28.4s, v4.4s, 31
-        SSRA    v29.4s, v5.4s, 31
-        SSRA    v30.4s, v6.4s, 31
-        SSRA    v31.4s, v7.4s, 31
+          SSRA    v28.4s, v4.4s, 31
+          SSRA    v29.4s, v5.4s, 31
+          SSRA    v30.4s, v6.4s, 31
+          SSRA    v31.4s, v7.4s, 31
 
-        SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
-        SRSHL   v17.4s, v17.4s, v1.4s
-        SRSHL   v18.4s, v18.4s, v1.4s
-        SRSHL   v19.4s, v19.4s, v1.4s
-        SRSHL   v20.4s, v20.4s, v1.4s
-        SRSHL   v21.4s, v21.4s, v1.4s
-        SRSHL   v22.4s, v22.4s, v1.4s
-        SRSHL   v23.4s, v23.4s, v1.4s
-        SRSHL   v24.4s, v24.4s, v1.4s
-        SRSHL   v25.4s, v25.4s, v1.4s
-        SRSHL   v26.4s, v26.4s, v1.4s
-        SRSHL   v27.4s, v27.4s, v1.4s
-        SRSHL   v28.4s, v28.4s, v1.4s
-        SRSHL   v29.4s, v29.4s, v1.4s
-        SRSHL   v30.4s, v30.4s, v1.4s
-        SRSHL   v31.4s, v31.4s, v1.4s
+          SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
+          SRSHL   v17.4s, v17.4s, v1.4s
+          SRSHL   v18.4s, v18.4s, v1.4s
+          SRSHL   v19.4s, v19.4s, v1.4s
+          SRSHL   v20.4s, v20.4s, v1.4s
+          SRSHL   v21.4s, v21.4s, v1.4s
+          SRSHL   v22.4s, v22.4s, v1.4s
+          SRSHL   v23.4s, v23.4s, v1.4s
+          SRSHL   v24.4s, v24.4s, v1.4s
+          SRSHL   v25.4s, v25.4s, v1.4s
+          SRSHL   v26.4s, v26.4s, v1.4s
+          SRSHL   v27.4s, v27.4s, v1.4s
+          SRSHL   v28.4s, v28.4s, v1.4s
+          SRSHL   v29.4s, v29.4s, v1.4s
+          SRSHL   v30.4s, v30.4s, v1.4s
+          SRSHL   v31.4s, v31.4s, v1.4s
+        $elif REQUANTIZATION == "FP32":
+          # Apply params - scale, bias and clamp
+          LD1R    {v0.4s}, [x11], 4
+
+          SCVTF   v16.4s, v16.4s
+          SCVTF   v17.4s, v17.4s
+          SCVTF   v18.4s, v18.4s
+          SCVTF   v19.4s, v19.4s
+          SCVTF   v20.4s, v20.4s
+          SCVTF   v21.4s, v21.4s
+          SCVTF   v22.4s, v22.4s
+          SCVTF   v23.4s, v23.4s
+          SCVTF   v24.4s, v24.4s
+          SCVTF   v25.4s, v25.4s
+          SCVTF   v26.4s, v26.4s
+          SCVTF   v27.4s, v27.4s
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          FMUL    v16.4s, v0.4s, v16.4s
+          FMUL    v17.4s, v0.4s, v17.4s
+          FMUL    v18.4s, v0.4s, v18.4s
+          FMUL    v19.4s, v0.4s, v19.4s
+          FMUL    v20.4s, v0.4s, v20.4s
+          FMUL    v21.4s, v0.4s, v21.4s
+          FMUL    v22.4s, v0.4s, v22.4s
+          FMUL    v23.4s, v0.4s, v23.4s
+          FMUL    v24.4s, v0.4s, v24.4s
+          FMUL    v25.4s, v0.4s, v25.4s
+          FMUL    v26.4s, v0.4s, v26.4s
+          FMUL    v27.4s, v0.4s, v27.4s
+          FMUL    v28.4s, v0.4s, v28.4s
+          FMUL    v29.4s, v0.4s, v29.4s
+          FMUL    v30.4s, v0.4s, v30.4s
+          FMUL    v31.4s, v0.4s, v31.4s
+
+          FCVTNS  v16.4s, v16.4s
+          FCVTNS  v17.4s, v17.4s
+          FCVTNS  v18.4s, v18.4s
+          FCVTNS  v19.4s, v19.4s
+          FCVTNS  v20.4s, v20.4s
+          FCVTNS  v21.4s, v21.4s
+          FCVTNS  v22.4s, v22.4s
+          FCVTNS  v23.4s, v23.4s
+          FCVTNS  v24.4s, v24.4s
+          FCVTNS  v25.4s, v25.4s
+          FCVTNS  v26.4s, v26.4s
+          FCVTNS  v27.4s, v27.4s
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
 
         SQXTN   v16.4h, v16.4s
         SQXTN   v17.4h, v17.4s
@@ -289,7 +347,7 @@
 6:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32
+END_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld32
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in b/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
index a87d160..c0ba8d8 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
@@ -5,7 +5,10 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -15,7 +18,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x12
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union ${CONV_PARAMS} params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -31,7 +34,7 @@
 # C3  x7 v19 v23 v27 v31
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64
 
         # Clamp A and C pointers
         CMP     x0, 2                   // if mr < 2
@@ -127,86 +130,141 @@
         TBNZ    x0, 2, 3f
 
 2:
-        # Apply params - scale, shift, bias and clamp
-        LD2R    {v0.4s, v1.4s}, [x11], 8
-        CMEQ    v2.4s, v1.4s, 0
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          LD2R    {v0.4s, v1.4s}, [x11], 8
+          CMEQ    v2.4s, v1.4s, 0
 
-        BIC     v4.16b, v16.16b, v2.16b
-        BIC     v5.16b, v17.16b, v2.16b
-        BIC     v6.16b, v18.16b, v2.16b
-        BIC     v7.16b, v19.16b, v2.16b
+          BIC     v4.16b, v16.16b, v2.16b
+          BIC     v5.16b, v17.16b, v2.16b
+          BIC     v6.16b, v18.16b, v2.16b
+          BIC     v7.16b, v19.16b, v2.16b
 
-        SQRDMULH v16.4s, v16.4s, v0.4s
-        SQRDMULH v17.4s, v17.4s, v0.4s
-        SQRDMULH v18.4s, v18.4s, v0.4s
-        SQRDMULH v19.4s, v19.4s, v0.4s
+          SQRDMULH v16.4s, v16.4s, v0.4s
+          SQRDMULH v17.4s, v17.4s, v0.4s
+          SQRDMULH v18.4s, v18.4s, v0.4s
+          SQRDMULH v19.4s, v19.4s, v0.4s
 
-        SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
-        SSRA    v17.4s, v5.4s, 31
-        SSRA    v18.4s, v6.4s, 31
-        SSRA    v19.4s, v7.4s, 31
+          SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
+          SSRA    v17.4s, v5.4s, 31
+          SSRA    v18.4s, v6.4s, 31
+          SSRA    v19.4s, v7.4s, 31
 
-        BIC     v4.16b, v20.16b, v2.16b
-        BIC     v5.16b, v21.16b, v2.16b
-        BIC     v6.16b, v22.16b, v2.16b
-        BIC     v7.16b, v23.16b, v2.16b
+          BIC     v4.16b, v20.16b, v2.16b
+          BIC     v5.16b, v21.16b, v2.16b
+          BIC     v6.16b, v22.16b, v2.16b
+          BIC     v7.16b, v23.16b, v2.16b
 
-        SQRDMULH v20.4s, v20.4s, v0.4s
-        SQRDMULH v21.4s, v21.4s, v0.4s
-        SQRDMULH v22.4s, v22.4s, v0.4s
-        SQRDMULH v23.4s, v23.4s, v0.4s
+          SQRDMULH v20.4s, v20.4s, v0.4s
+          SQRDMULH v21.4s, v21.4s, v0.4s
+          SQRDMULH v22.4s, v22.4s, v0.4s
+          SQRDMULH v23.4s, v23.4s, v0.4s
 
-        SSRA    v20.4s, v4.4s, 31
-        SSRA    v21.4s, v5.4s, 31
-        SSRA    v22.4s, v6.4s, 31
-        SSRA    v23.4s, v7.4s, 31
+          SSRA    v20.4s, v4.4s, 31
+          SSRA    v21.4s, v5.4s, 31
+          SSRA    v22.4s, v6.4s, 31
+          SSRA    v23.4s, v7.4s, 31
 
-        BIC     v4.16b, v24.16b, v2.16b
-        BIC     v5.16b, v25.16b, v2.16b
-        BIC     v6.16b, v26.16b, v2.16b
-        BIC     v7.16b, v27.16b, v2.16b
+          BIC     v4.16b, v24.16b, v2.16b
+          BIC     v5.16b, v25.16b, v2.16b
+          BIC     v6.16b, v26.16b, v2.16b
+          BIC     v7.16b, v27.16b, v2.16b
 
-        SQRDMULH v24.4s, v24.4s, v0.4s
-        SQRDMULH v25.4s, v25.4s, v0.4s
-        SQRDMULH v26.4s, v26.4s, v0.4s
-        SQRDMULH v27.4s, v27.4s, v0.4s
+          SQRDMULH v24.4s, v24.4s, v0.4s
+          SQRDMULH v25.4s, v25.4s, v0.4s
+          SQRDMULH v26.4s, v26.4s, v0.4s
+          SQRDMULH v27.4s, v27.4s, v0.4s
 
-        SSRA    v24.4s, v4.4s, 31
-        SSRA    v25.4s, v5.4s, 31
-        SSRA    v26.4s, v6.4s, 31
-        SSRA    v27.4s, v7.4s, 31
+          SSRA    v24.4s, v4.4s, 31
+          SSRA    v25.4s, v5.4s, 31
+          SSRA    v26.4s, v6.4s, 31
+          SSRA    v27.4s, v7.4s, 31
 
-        BIC     v4.16b, v28.16b, v2.16b
-        BIC     v5.16b, v29.16b, v2.16b
-        BIC     v6.16b, v30.16b, v2.16b
-        BIC     v7.16b, v31.16b, v2.16b
+          BIC     v4.16b, v28.16b, v2.16b
+          BIC     v5.16b, v29.16b, v2.16b
+          BIC     v6.16b, v30.16b, v2.16b
+          BIC     v7.16b, v31.16b, v2.16b
 
-        SQRDMULH v28.4s, v28.4s, v0.4s
-        SQRDMULH v29.4s, v29.4s, v0.4s
-        SQRDMULH v30.4s, v30.4s, v0.4s
-        SQRDMULH v31.4s, v31.4s, v0.4s
+          SQRDMULH v28.4s, v28.4s, v0.4s
+          SQRDMULH v29.4s, v29.4s, v0.4s
+          SQRDMULH v30.4s, v30.4s, v0.4s
+          SQRDMULH v31.4s, v31.4s, v0.4s
 
-        SSRA    v28.4s, v4.4s, 31
-        SSRA    v29.4s, v5.4s, 31
-        SSRA    v30.4s, v6.4s, 31
-        SSRA    v31.4s, v7.4s, 31
+          SSRA    v28.4s, v4.4s, 31
+          SSRA    v29.4s, v5.4s, 31
+          SSRA    v30.4s, v6.4s, 31
+          SSRA    v31.4s, v7.4s, 31
 
-        SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
-        SRSHL   v17.4s, v17.4s, v1.4s
-        SRSHL   v18.4s, v18.4s, v1.4s
-        SRSHL   v19.4s, v19.4s, v1.4s
-        SRSHL   v20.4s, v20.4s, v1.4s
-        SRSHL   v21.4s, v21.4s, v1.4s
-        SRSHL   v22.4s, v22.4s, v1.4s
-        SRSHL   v23.4s, v23.4s, v1.4s
-        SRSHL   v24.4s, v24.4s, v1.4s
-        SRSHL   v25.4s, v25.4s, v1.4s
-        SRSHL   v26.4s, v26.4s, v1.4s
-        SRSHL   v27.4s, v27.4s, v1.4s
-        SRSHL   v28.4s, v28.4s, v1.4s
-        SRSHL   v29.4s, v29.4s, v1.4s
-        SRSHL   v30.4s, v30.4s, v1.4s
-        SRSHL   v31.4s, v31.4s, v1.4s
+          SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
+          SRSHL   v17.4s, v17.4s, v1.4s
+          SRSHL   v18.4s, v18.4s, v1.4s
+          SRSHL   v19.4s, v19.4s, v1.4s
+          SRSHL   v20.4s, v20.4s, v1.4s
+          SRSHL   v21.4s, v21.4s, v1.4s
+          SRSHL   v22.4s, v22.4s, v1.4s
+          SRSHL   v23.4s, v23.4s, v1.4s
+          SRSHL   v24.4s, v24.4s, v1.4s
+          SRSHL   v25.4s, v25.4s, v1.4s
+          SRSHL   v26.4s, v26.4s, v1.4s
+          SRSHL   v27.4s, v27.4s, v1.4s
+          SRSHL   v28.4s, v28.4s, v1.4s
+          SRSHL   v29.4s, v29.4s, v1.4s
+          SRSHL   v30.4s, v30.4s, v1.4s
+          SRSHL   v31.4s, v31.4s, v1.4s
+        $elif REQUANTIZATION == "FP32":
+          # Apply params - scale, bias and clamp
+          LD1R    {v0.4s}, [x11], 4
+
+          SCVTF   v16.4s, v16.4s
+          SCVTF   v17.4s, v17.4s
+          SCVTF   v18.4s, v18.4s
+          SCVTF   v19.4s, v19.4s
+          SCVTF   v20.4s, v20.4s
+          SCVTF   v21.4s, v21.4s
+          SCVTF   v22.4s, v22.4s
+          SCVTF   v23.4s, v23.4s
+          SCVTF   v24.4s, v24.4s
+          SCVTF   v25.4s, v25.4s
+          SCVTF   v26.4s, v26.4s
+          SCVTF   v27.4s, v27.4s
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          FMUL    v16.4s, v0.4s, v16.4s
+          FMUL    v17.4s, v0.4s, v17.4s
+          FMUL    v18.4s, v0.4s, v18.4s
+          FMUL    v19.4s, v0.4s, v19.4s
+          FMUL    v20.4s, v0.4s, v20.4s
+          FMUL    v21.4s, v0.4s, v21.4s
+          FMUL    v22.4s, v0.4s, v22.4s
+          FMUL    v23.4s, v0.4s, v23.4s
+          FMUL    v24.4s, v0.4s, v24.4s
+          FMUL    v25.4s, v0.4s, v25.4s
+          FMUL    v26.4s, v0.4s, v26.4s
+          FMUL    v27.4s, v0.4s, v27.4s
+          FMUL    v28.4s, v0.4s, v28.4s
+          FMUL    v29.4s, v0.4s, v29.4s
+          FMUL    v30.4s, v0.4s, v30.4s
+          FMUL    v31.4s, v0.4s, v31.4s
+
+          FCVTNS  v16.4s, v16.4s
+          FCVTNS  v17.4s, v17.4s
+          FCVTNS  v18.4s, v18.4s
+          FCVTNS  v19.4s, v19.4s
+          FCVTNS  v20.4s, v20.4s
+          FCVTNS  v21.4s, v21.4s
+          FCVTNS  v22.4s, v22.4s
+          FCVTNS  v23.4s, v23.4s
+          FCVTNS  v24.4s, v24.4s
+          FCVTNS  v25.4s, v25.4s
+          FCVTNS  v26.4s, v26.4s
+          FCVTNS  v27.4s, v27.4s
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
 
         SQXTN   v16.4h, v16.4s
         SQXTN   v17.4h, v17.4s
@@ -340,7 +398,7 @@
 8:
         RET
 
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+END_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S b/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S
new file mode 100644
index 0000000..598355e
--- /dev/null
+++ b/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           (x4)
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          (x7)
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# B   x5 v16 v17 v18 v19
+# C0  x6 v28 v29 v30 v31
+# unused v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
+0:
+        # Load initial bias from w into accumulators
+        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
+        LDP     q28, q29, [x5], 32
+        BIC     x2, x2, 3
+        LDP     q30, q31, [x5], 32
+        MOV     x0, x2                  // k = kc.  assumes kc > 0
+        LDR     x11, [sp, 8]            // params
+
+        # Main loop - 4 bytes of A
+        .p2align 3
+1:
+        LDR     s0,  [x3], 4
+        LDR     q16, [x5], 16
+        LDR     q17, [x5], 16
+        LDR     q18, [x5], 16
+        LDR     q19, [x5], 16
+        SDOT    v28.4s, v16.16b, v0.4b[0]
+        SDOT    v29.4s, v17.16b, v0.4b[0]
+        SUBS    x0, x0, 4
+        SDOT    v30.4s, v18.16b, v0.4b[0]
+        SDOT    v31.4s, v19.16b, v0.4b[0]
+        B.HI    1b
+
+        # Apply params - scale, bias and clamp
+        LD1R    {v0.4s}, [x11], 4
+
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v28.4s, v0.4s, v28.4s
+        FMUL    v29.4s, v0.4s, v29.4s
+        FMUL    v30.4s, v0.4s, v30.4s
+        FMUL    v31.4s, v0.4s, v31.4s
+
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        LD1R    {v2.8h}, [x11], 2       // add bias
+        SQXTN   v4.4h, v28.4s
+        SQXTN   v6.4h, v30.4s
+        SQXTN2  v4.8h, v29.4s
+        SQXTN2  v6.8h, v31.4s
+        LD2R    {v0.16b, v1.16b}, [x11]   // clamp to min/max
+        SQADD   v4.8h, v4.8h, v2.8h
+        SQADD   v6.8h, v6.8h, v2.8h
+        LDR     x12, [sp]               // cn_stride
+        SQXTN   v4.8b, v4.8h
+        SQXTN2  v4.16b, v6.8h
+        SUBS    x1, x1, 16
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMIN    v4.16b, v4.16b, v1.16b
+        B.LO    2f
+
+        # Store full 1 x 16
+        ST1     {v4.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        B.NE    0b
+        RET
+
+        # Store odd width
+        .p2align 3
+2:
+        TBZ     x1, 3, 3f
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+3:
+        TBZ     x1, 2, 4f
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+4:
+        TBZ     x1, 1, 5f
+        ST1     {v4.h}[0], [x6], 2
+        DUP     h4, v4.h[1]
+5:
+        TBZ     x1, 0, 6f
+        ST1     {v4.b}[0], [x6]
+6:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
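
The epilogue loads in this kernel imply a packed params layout for the FP32 path: 4 bytes of scale, 2 bytes of output zero point, then adjacent min/max bytes. The struct below is a hedged reconstruction from the LD1R/LD2R sizes, not the actual XNNPACK header; the struct and field names are assumptions.

    #include <stdint.h>

    // Layout implied by the loads (reconstruction, not the real header):
    //   LD1R {v0.4s}, [x11], 4        -> 4-byte float scale
    //   LD1R {v2.8h}, [x11], 2        -> 2-byte output zero point
    //   LD2R {v0.16b, v1.16b}, [x11]  -> adjacent int8 min and max
    struct fp32_neondot_params_sketch {
      float scale;                // FMUL operand
      int16_t output_zero_point;  // SQADD bias
      int8_t output_min;          // SMAX clamp
      int8_t output_max;          // SMIN clamp
    };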
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S b/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S
new file mode 100644
index 0000000..2714f37
--- /dev/null
+++ b/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S
@@ -0,0 +1,158 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           (x4)
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          (x7)
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# B   x5 v4  v5  v6  v7  v16  v17 v18 v19
+# C0  x6 v28 v29 v30 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64
+        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
+        BIC     x2, x2, 3
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q28, q29, [x5], 32
+        SUBS    x0, x2, 8               // k = kc - 8
+        LDP     q30, q31, [x5], 32
+        LDR     x11, [sp, 8]            // params
+
+        # Are there at least 8 bytes?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
+        LDR     d0,  [x3], 8
+        LDR     q16, [x5, 0]
+        LDR     q17, [x5, 16]
+        SDOT    v28.4s, v16.16b, v0.4b[0]
+        LDR     q18, [x5, 32]
+        SDOT    v29.4s, v17.16b, v0.4b[0]
+        LDR     q19, [x5, 48]
+        SDOT    v30.4s, v18.16b, v0.4b[0]
+        LDR     q4, [x5, 64]
+        SDOT    v31.4s, v19.16b, v0.4b[0]
+        LDR     q5, [x5, 80]
+        SDOT    v28.4s, v4.16b,  v0.4b[1]
+        LDR     q6, [x5, 96]
+        SDOT    v29.4s, v5.16b,  v0.4b[1]
+        LDR     q7, [x5, 112]
+        SDOT    v30.4s, v6.16b,  v0.4b[1]
+        ADD     x5, x5, 128
+        SDOT    v31.4s, v7.16b,  v0.4b[1]
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        # Is there a remainder? - 4 bytes of A
+        TBNZ    x0, 2, 3f
+
+2:
+
+        # Apply params - scale, bias and clamp
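+        # fp32 requantization, per lane (a C-like sketch of the sequence below;
+        # sat16/sat8/satadd16/clamp are illustrative helpers, and scale,
+        # zero_point, min, max come from the params struct at x11):
+        #   float   f = (float) acc * scale;            // SCVTF + FMUL
+        #   int32_t r = lrintf(f);                      // FCVTNS, nearest-even
+        #   int16_t h = satadd16(sat16(r), zero_point); // SQXTN + SQADD
+        #   int8_t  o = clamp(sat8(h), min, max);       // SQXTN + SMAX/SMIN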
+        LD1R    {v0.4s}, [x11], 4
+
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v28.4s, v0.4s, v28.4s
+        FMUL    v29.4s, v0.4s, v29.4s
+        FMUL    v30.4s, v0.4s, v30.4s
+        FMUL    v31.4s, v0.4s, v31.4s
+
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        LD1R    {v2.8h}, [x11], 2       // add bias
+        SQXTN   v4.4h, v28.4s
+        SQXTN   v6.4h, v30.4s
+        SQXTN2  v4.8h, v29.4s
+        SQXTN2  v6.8h, v31.4s
+
+        LD2R    {v0.16b, v1.16b}, [x11]   // clamp to min/max
+        SQADD   v4.8h, v4.8h, v2.8h
+        SQADD   v6.8h, v6.8h, v2.8h
+        LDR     x12, [sp]               // cn_stride
+        SQXTN   v4.8b, v4.8h
+        SQXTN2  v4.16b, v6.8h
+        SUBS    x1, x1, 16
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMIN    v4.16b, v4.16b, v1.16b
+        B.LO    4f
+
+        # Store full 1 x 16
+        ST1     {v4.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        B.NE    0b
+
+        RET
+
+        # Remainder - 4 bytes of A
+        .p2align 3
+3:
+        LDR     s0,  [x3], 4
+        LDR     q16, [x5, 0]
+        LDR     q17, [x5, 16]
+        SDOT    v28.4s, v16.16b, v0.4b[0]
+        LDR     q18, [x5, 32]
+        SDOT    v29.4s, v17.16b, v0.4b[0]
+        LDR     q19, [x5, 48]
+        SDOT    v30.4s, v18.16b, v0.4b[0]
+        ADD     x5, x5, 64
+        SDOT    v31.4s, v19.16b, v0.4b[0]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ     x1, 3, 5f
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+5:
+        TBZ     x1, 2, 6f
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+6:
+        TBZ     x1, 1, 7f
+        ST1     {v4.h}[0], [x6], 2
+        DUP     h4, v4.h[1]
+7:
+        TBZ     x1, 0, 8f
+        ST1     {v4.b}[0], [x6]
+8:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S b/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
index 0187c61..c2dac5d 100644
--- a/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
+++ b/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
@@ -9,6 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
+
 # void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
@@ -19,7 +20,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          (x7)
 #     size_t cn_stride,          [sp] -> x12
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -57,27 +58,32 @@
         # Apply params - scale, shift, bias and clamp
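+        # gemmlowp requantization (sketch): q = SQRDMULH(acc, multiplier), with
+        # CMEQ/BIC + SSRA applying a sign fixup to negative accumulators when
+        # the shift is nonzero, then SRSHL rounding-shifts by the signed shift
+        # in v1 (negative values shift right).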
         LD2R    {v0.4s, v1.4s}, [x11], 8
         CMEQ    v2.4s, v1.4s, 0
-        SQRDMULH v4.4s, v28.4s, v0.4s
-        SQRDMULH v5.4s, v29.4s, v0.4s
-        SQRDMULH v6.4s, v30.4s, v0.4s
-        SQRDMULH v7.4s, v31.4s, v0.4s
-        BIC     v28.16b, v28.16b, v2.16b
-        BIC     v29.16b, v29.16b, v2.16b
-        BIC     v30.16b, v30.16b, v2.16b
-        BIC     v31.16b, v31.16b, v2.16b
-        SSRA    v4.4s, v28.4s, 31       // signed shift right accumulate
-        SSRA    v5.4s, v29.4s, 31
-        SSRA    v6.4s, v30.4s, 31
-        SSRA    v7.4s, v31.4s, 31
-        SRSHL   v4.4s, v4.4s, v1.4s     // signed rounding shift left
-        SRSHL   v5.4s, v5.4s, v1.4s
-        SRSHL   v6.4s, v6.4s, v1.4s
-        SRSHL   v7.4s, v7.4s, v1.4s
+
+        BIC     v4.16b, v28.16b, v2.16b
+        BIC     v5.16b, v29.16b, v2.16b
+        BIC     v6.16b, v30.16b, v2.16b
+        BIC     v7.16b, v31.16b, v2.16b
+
+        SQRDMULH v28.4s, v28.4s, v0.4s
+        SQRDMULH v29.4s, v29.4s, v0.4s
+        SQRDMULH v30.4s, v30.4s, v0.4s
+        SQRDMULH v31.4s, v31.4s, v0.4s
+
+        SSRA    v28.4s, v4.4s, 31
+        SSRA    v29.4s, v5.4s, 31
+        SSRA    v30.4s, v6.4s, 31
+        SSRA    v31.4s, v7.4s, 31
+
+        SRSHL   v28.4s, v28.4s, v1.4s   // signed rounding shift left
+        SRSHL   v29.4s, v29.4s, v1.4s
+        SRSHL   v30.4s, v30.4s, v1.4s
+        SRSHL   v31.4s, v31.4s, v1.4s
+
         LD1R    {v2.8h}, [x11], 2       // add bias
-        SQXTN   v4.4h, v4.4s
-        SQXTN   v6.4h, v6.4s
-        SQXTN2  v4.8h, v5.4s
-        SQXTN2  v6.8h, v7.4s
+        SQXTN   v4.4h, v28.4s
+        SQXTN   v6.4h, v30.4s
+        SQXTN2  v4.8h, v29.4s
+        SQXTN2  v6.8h, v31.4s
         LD2R    {v0.16b, v1.16b}, [x11]   // clamp to min/max
         SQADD   v4.8h, v4.8h, v2.8h
         SQADD   v6.8h, v6.8h, v2.8h
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S b/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
index 6c88bcd..bcb7064 100644
--- a/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+++ b/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -9,6 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
+
 # void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
@@ -19,7 +20,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          (x7)
 #     size_t cn_stride,          [sp] -> x12
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -72,30 +73,37 @@
         TBNZ    x0, 2, 3f
 
 2:
-         # Apply params - scale, shift, bias and clamp
+
+        # Apply params - scale, shift, bias and clamp
         LD2R    {v0.4s, v1.4s}, [x11], 8
-        SQRDMULH v4.4s, v28.4s, v0.4s
-        SQRDMULH v5.4s, v29.4s, v0.4s
         CMEQ    v2.4s, v1.4s, 0
-        SQRDMULH v6.4s, v30.4s, v0.4s
-        SQRDMULH v7.4s, v31.4s, v0.4s
-        BIC     v28.16b, v28.16b, v2.16b
-        BIC     v29.16b, v29.16b, v2.16b
-        BIC     v30.16b, v30.16b, v2.16b
-        BIC     v31.16b, v31.16b, v2.16b
-        SSRA    v4.4s, v28.4s, 31       // signed shift right accumulate
-        SSRA    v5.4s, v29.4s, 31
-        SSRA    v6.4s, v30.4s, 31
-        SSRA    v7.4s, v31.4s, 31
-        SRSHL   v4.4s, v4.4s, v1.4s     // signed rounding shift left
-        SRSHL   v5.4s, v5.4s, v1.4s
-        SRSHL   v6.4s, v6.4s, v1.4s
-        SRSHL   v7.4s, v7.4s, v1.4s
+
+        BIC     v4.16b, v28.16b, v2.16b
+        BIC     v5.16b, v29.16b, v2.16b
+        BIC     v6.16b, v30.16b, v2.16b
+        BIC     v7.16b, v31.16b, v2.16b
+
+        SQRDMULH v28.4s, v28.4s, v0.4s
+        SQRDMULH v29.4s, v29.4s, v0.4s
+        SQRDMULH v30.4s, v30.4s, v0.4s
+        SQRDMULH v31.4s, v31.4s, v0.4s
+
+        SSRA    v28.4s, v4.4s, 31
+        SSRA    v29.4s, v5.4s, 31
+        SSRA    v30.4s, v6.4s, 31
+        SSRA    v31.4s, v7.4s, 31
+
+        SRSHL   v28.4s, v28.4s, v1.4s   // signed rounding shift left
+        SRSHL   v29.4s, v29.4s, v1.4s
+        SRSHL   v30.4s, v30.4s, v1.4s
+        SRSHL   v31.4s, v31.4s, v1.4s
+
         LD1R    {v2.8h}, [x11], 2       // add bias
-        SQXTN   v4.4h, v4.4s
-        SQXTN   v6.4h, v6.4s
-        SQXTN2  v4.8h, v5.4s
-        SQXTN2  v6.8h, v7.4s
+        SQXTN   v4.4h, v28.4s
+        SQXTN   v6.4h, v30.4s
+        SQXTN2  v4.8h, v29.4s
+        SQXTN2  v6.8h, v31.4s
+
         LD2R    {v0.16b, v1.16b}, [x11]   // clamp to min/max
         SQADD   v4.8h, v4.8h, v2.8h
         SQADD   v6.8h, v6.8h, v2.8h
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
new file mode 100644
index 0000000..a270fa14
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
@@ -0,0 +1,648 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3  v0  v4
+# A1 x15  v1  v5
+# A2 x13  v2  v6
+# A3  x4  v3  v7
+# B   x5  v8  v9 v10 v11
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+# unused v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+        # Clamp A and C pointers
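+        # (sketch of the CSEL logic: a1 = mr < 2 ? a0 : a0 + a_stride,
+        #  a2 = mr <= 2 ? a1 : a1 + a_stride, a3 = mr < 4 ? a2 : a2 + a_stride,
+        #  and likewise c1-c3 with cm_stride)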
+        CMP     x0, 2                   // if mr < 2
+        STP     d8,  d9, [sp, -32]!
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        STP     d10, d11, [sp, 16]
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+        BIC     x2, x2, 3
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        LDR     x11, [sp, 40]           // reload params
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        SUBS    x0, x2, 16              // k = kc - 16
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Is there at least 16 bytes for prologue/epilogue?
+        B.LO    4f
+
+        # prologue - read A and B values for block 0 and 1
+        LDR     d0,  [x3], 8
+        LDR     q8,  [x5], 16
+        LDR     d1, [x15], 8
+        LDR     d2, [x13], 8
+        LDR     d3,  [x4], 8
+        SUBS    x0, x0, 16              // is there 16 for main loop?
+        LDR     d9,  [x5], 8
+        LDR     x14,  [x5], 8
+        # Is there at least 16 bytes for main loop?
+        B.LO    2f
+
+        # Main loop - 16 bytes of A in 4 groups.
+        # 4 rows of 4 vectors wide = 16 SDOT instructions per group of 4 channels.
+        # 4 LD64 for A.
+        # 4 LD128 for W, each done as 2 LD64 + INS.
+        # For each group of 4 SDOTs: 1 LD64 for A and 2 LD64 + INS for W.
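+        # (per iteration: 4 groups x 4 rows x 4 B vectors = 64 SDOTs against
+        #  16 bytes of K per row and 256 bytes of W)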
+
+        .p2align 3
+1:
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[0]
+        INS     v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[0]
+        LDR     d4,  [x3], 8
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[0]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[0]
+        LDR     d5, [x15], 8
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[0]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[0]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[0]
+        LDR     d6, [x13], 8
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[0]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[0]
+        INS     v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[0]
+        LDR     d7,  [x4], 8
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[1]
+        INS     v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[1]
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[1]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[1]
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[1]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[1]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[1]
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[1]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[1]
+        INS     v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[1]
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[0]
+        INS     v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[0]
+        LDR     d0,  [x3], 8
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[0]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[0]
+        LDR     d1, [x15], 8
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[0]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v5.4b[0]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v7.4b[0]
+        LDR     d2, [x13], 8
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[0]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v5.4b[0]
+        INS     v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v7.4b[0]
+        LDR     d3,  [x4], 8
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[1]
+        INS     v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[1]
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[1]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[1]
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[1]
+        LDR     d8,  [x5], 8            // First B values for block 0 and 1
+        SDOT    v25.4s, v10.16b, v5.4b[1]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v7.4b[1]
+        SUBS    x0, x0, 16
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[1]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v5.4b[1]
+        INS     v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v7.4b[1]
+        B.HS    1b
+
+        # Epilogue.  Same as the main loop, but without preloads in the final group
+2:
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[0]
+        INS     v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[0]
+        LDR     d4,  [x3], 8
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[0]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[0]
+        LDR     d5, [x15], 8
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[0]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[0]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[0]
+        LDR     d6, [x13], 8
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[0]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[0]
+        INS     v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v2.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[0]
+        LDR     d7,  [x4], 8
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[1]
+        INS     v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[1]
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[1]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[1]
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[1]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[1]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[1]
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[1]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[1]
+        INS     v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v2.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[1]
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[0]
+        INS     v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[0]
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[0]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[0]
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[0]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v5.4b[0]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v27.4s, v10.16b, v7.4b[0]
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[0]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v5.4b[0]
+        INS     v8.d[1], x14
+        SDOT    v30.4s, v11.16b, v6.4b[0]
+        LDR     x14,  [x5], 8
+        SDOT    v31.4s, v11.16b, v7.4b[0]
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[1]
+        INS     v9.d[1], x14
+        SDOT    v18.4s,  v8.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[1]
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[1]
+        INS     v10.d[1], x14
+        SDOT    v22.4s,  v9.16b, v6.4b[1]
+        LDR     x14,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[1]
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[1]
+        SDOT    v25.4s, v10.16b, v5.4b[1]
+        INS     v11.d[1], x14
+        SDOT    v26.4s, v10.16b, v6.4b[1]
+        SDOT    v27.4s, v10.16b, v7.4b[1]
+        AND     x0, x2, 15              // kc remainder 0 to 12
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[1]
+        SDOT    v29.4s, v11.16b, v5.4b[1]
+        SDOT    v30.4s, v11.16b, v6.4b[1]
+        SDOT    v31.4s, v11.16b, v7.4b[1]
+
+        # Is there a remainder? - 4 to 12 bytes of A
+        CBNZ    x0, 5f
+
+        .p2align 3
+3:
+        # Apply params - scale, bias and clamp
+        LD1R    {v0.4s}, [x11], 4
+
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v16.4s, v0.4s, v16.4s
+        FMUL    v17.4s, v0.4s, v17.4s
+        FMUL    v18.4s, v0.4s, v18.4s
+        FMUL    v19.4s, v0.4s, v19.4s
+        FMUL    v20.4s, v0.4s, v20.4s
+        FMUL    v21.4s, v0.4s, v21.4s
+        FMUL    v22.4s, v0.4s, v22.4s
+        FMUL    v23.4s, v0.4s, v23.4s
+        FMUL    v24.4s, v0.4s, v24.4s
+        FMUL    v25.4s, v0.4s, v25.4s
+        FMUL    v26.4s, v0.4s, v26.4s
+        FMUL    v27.4s, v0.4s, v27.4s
+        FMUL    v28.4s, v0.4s, v28.4s
+        FMUL    v29.4s, v0.4s, v29.4s
+        FMUL    v30.4s, v0.4s, v30.4s
+        FMUL    v31.4s, v0.4s, v31.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v2.8h}, [x11], 2       // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v2.8h
+        SQADD   v17.8h, v17.8h, v2.8h
+        SQADD   v18.8h, v18.8h, v2.8h
+        SQADD   v19.8h, v19.8h, v2.8h
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
+        LD1R    {v0.16b}, [x11], 1      // clamp min value
+
+        SQXTN   v4.8b, v16.8h
+        SQXTN   v5.8b, v17.8h
+        SQXTN   v6.8b, v18.8h
+        SQXTN   v7.8b, v19.8h
+        LD1R    {v1.16b}, [x11]         // clamp max value
+        SQXTN2  v4.16b, v24.8h
+        SQXTN2  v5.16b, v25.8h
+        SQXTN2  v6.16b, v26.8h
+        SQXTN2  v7.16b, v27.8h
+        LDR     x12, [sp, 32]           // cn_stride
+
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
+        SUBS    x1, x1, 16
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
+        B.LO    6f
+
+        # Store full 4 x 16
+        ST1     {v4.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v5.16b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v6.16b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v7.16b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+
+        LDP     d10, d11, [sp, 16]
+        LDP     d8,  d9, [sp], 32
+        RET
+
+        # Remainder - 4 to 12 bytes of A
+        # Although the kernel is C4, it is safe to read 16 bytes; the pointers
+        # advance by only the remainder (x0).
+        .p2align 3
+4:
+        AND     x0, x2, 15              // kc remainder 4 to 12
+5:
+        LDP     q8,  q9,  [x5], 32
+        LDP     q10, q11,  [x5], 32
+        LD1     {v0.16b},  [x3], x0
+        LD1     {v1.16b}, [x15], x0
+        LD1     {v2.16b}, [x13], x0
+        LD1     {v3.16b},  [x4], x0
+        SDOT    v16.4s,  v8.16b, v0.4b[0]
+        SDOT    v17.4s,  v8.16b, v1.4b[0]
+        SDOT    v18.4s,  v8.16b, v2.4b[0]
+        SDOT    v19.4s,  v8.16b, v3.4b[0]
+        SDOT    v20.4s,  v9.16b, v0.4b[0]
+        SDOT    v21.4s,  v9.16b, v1.4b[0]
+        SDOT    v22.4s,  v9.16b, v2.4b[0]
+        SDOT    v23.4s,  v9.16b, v3.4b[0]
+        SDOT    v24.4s, v10.16b, v0.4b[0]
+        SDOT    v25.4s, v10.16b, v1.4b[0]
+        SDOT    v26.4s, v10.16b, v2.4b[0]
+        SDOT    v27.4s, v10.16b, v3.4b[0]
+        SDOT    v28.4s, v11.16b, v0.4b[0]
+        SDOT    v29.4s, v11.16b, v1.4b[0]
+        SDOT    v30.4s, v11.16b, v2.4b[0]
+        SDOT    v31.4s, v11.16b, v3.4b[0]
+        CMP     x0, 4
+        B.LS    3b
+        LDP     q8,  q9,  [x5], 32
+        LDP     q10, q11,  [x5], 32
+        SDOT    v16.4s,  v8.16b, v0.4b[1]
+        SDOT    v17.4s,  v8.16b, v1.4b[1]
+        SDOT    v18.4s,  v8.16b, v2.4b[1]
+        SDOT    v19.4s,  v8.16b, v3.4b[1]
+        SDOT    v20.4s,  v9.16b, v0.4b[1]
+        SDOT    v21.4s,  v9.16b, v1.4b[1]
+        SDOT    v22.4s,  v9.16b, v2.4b[1]
+        SDOT    v23.4s,  v9.16b, v3.4b[1]
+        SDOT    v24.4s, v10.16b, v0.4b[1]
+        SDOT    v25.4s, v10.16b, v1.4b[1]
+        SDOT    v26.4s, v10.16b, v2.4b[1]
+        SDOT    v27.4s, v10.16b, v3.4b[1]
+        SDOT    v28.4s, v11.16b, v0.4b[1]
+        SDOT    v29.4s, v11.16b, v1.4b[1]
+        SDOT    v30.4s, v11.16b, v2.4b[1]
+        SDOT    v31.4s, v11.16b, v3.4b[1]
+        CMP     x0, 8
+        B.LS    3b
+        LDP     q8,  q9,  [x5], 32
+        LDP     q10, q11,  [x5], 32
+        SDOT    v16.4s,  v8.16b, v0.4b[2]
+        SDOT    v17.4s,  v8.16b, v1.4b[2]
+        SDOT    v18.4s,  v8.16b, v2.4b[2]
+        SDOT    v19.4s,  v8.16b, v3.4b[2]
+        SDOT    v20.4s,  v9.16b, v0.4b[2]
+        SDOT    v21.4s,  v9.16b, v1.4b[2]
+        SDOT    v22.4s,  v9.16b, v2.4b[2]
+        SDOT    v23.4s,  v9.16b, v3.4b[2]
+        SDOT    v24.4s, v10.16b, v0.4b[2]
+        SDOT    v25.4s, v10.16b, v1.4b[2]
+        SDOT    v26.4s, v10.16b, v2.4b[2]
+        SDOT    v27.4s, v10.16b, v3.4b[2]
+        SDOT    v28.4s, v11.16b, v0.4b[2]
+        SDOT    v29.4s, v11.16b, v1.4b[2]
+        SDOT    v30.4s, v11.16b, v2.4b[2]
+        SDOT    v31.4s, v11.16b, v3.4b[2]
+        B       3b
+
+        # Store odd width
+        .p2align 3
+6:
+        TBZ     x1, 3, 7f
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+        STR     d5, [x8], 8
+        DUP     d5, v5.d[1]
+        STR     d6, [x9], 8
+        DUP     d6, v6.d[1]
+        STR     d7, [x7], 8
+        DUP     d7, v7.d[1]
+7:
+        TBZ     x1, 2, 8f
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+        STR     s5, [x8], 4
+        DUP     s5, v5.s[1]
+        STR     s6, [x9], 4
+        DUP     s6, v6.s[1]
+        STR     s7, [x7], 4
+        DUP     s7, v7.s[1]
+8:
+        TBZ     x1, 1, 9f
+        ST1     {v4.h}[0], [x6], 2
+        DUP     h4, v4.h[1]
+        ST1     {v5.h}[0], [x8], 2
+        DUP     h5, v5.h[1]
+        ST1     {v6.h}[0], [x9], 2
+        DUP     h6, v6.h[1]
+        ST1     {v7.h}[0], [x7], 2
+        DUP     h7, v7.h[1]
+9:
+        TBZ     x1, 0, 10f
+        ST1     {v4.b}[0], [x6]
+        ST1     {v5.b}[0], [x8]
+        ST1     {v6.b}[0], [x9]
+        ST1     {v7.b}[0], [x7]
+10:
+        LDP     d10, d11, [sp, 16]
+        LDP     d8,  d9, [sp], 32
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S
new file mode 100644
index 0000000..e48aaef
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S
@@ -0,0 +1,274 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v4  v5  v6  v7
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+        BIC     x2, x2, 3
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        LDR     x11, [sp, 8]            // reload params
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     x0, x2                  // k = kc.  assumes kc > 0
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+
+        # Main loop - 4 bytes of A
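+        # (ld32 variant: each iteration loads 4 bytes of A per row and
+        #  64 bytes of W for 16 SDOTs)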
+        .p2align 3
+1:
+        LDR     s0,  [x3], 4
+        LDR     q4, [x5], 16
+        LDR     s1, [x15], 4
+        LDR     s2, [x13], 4
+        LDR     s3,  [x4], 4
+        SDOT    v16.4s, v4.16b, v0.4b[0]
+        SDOT    v17.4s, v4.16b, v1.4b[0]
+        LDR     q5, [x5], 16
+        SDOT    v18.4s, v4.16b, v2.4b[0]
+        SDOT    v19.4s, v4.16b, v3.4b[0]
+        LDR     q6, [x5], 16
+        SDOT    v20.4s, v5.16b, v0.4b[0]
+        SDOT    v21.4s, v5.16b, v1.4b[0]
+        LDR     q7, [x5], 16
+        SDOT    v22.4s, v5.16b, v2.4b[0]
+        SDOT    v23.4s, v5.16b, v3.4b[0]
+        SUBS    x0, x0, 4
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+        B.HI    1b
+
+        # Apply params - scale, bias and clamp
+        LD1R    {v0.4s}, [x11], 4
+
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v16.4s, v0.4s, v16.4s
+        FMUL    v17.4s, v0.4s, v17.4s
+        FMUL    v18.4s, v0.4s, v18.4s
+        FMUL    v19.4s, v0.4s, v19.4s
+        FMUL    v20.4s, v0.4s, v20.4s
+        FMUL    v21.4s, v0.4s, v21.4s
+        FMUL    v22.4s, v0.4s, v22.4s
+        FMUL    v23.4s, v0.4s, v23.4s
+        FMUL    v24.4s, v0.4s, v24.4s
+        FMUL    v25.4s, v0.4s, v25.4s
+        FMUL    v26.4s, v0.4s, v26.4s
+        FMUL    v27.4s, v0.4s, v27.4s
+        FMUL    v28.4s, v0.4s, v28.4s
+        FMUL    v29.4s, v0.4s, v29.4s
+        FMUL    v30.4s, v0.4s, v30.4s
+        FMUL    v31.4s, v0.4s, v31.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v2.8h}, [x11], 2       // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v2.8h
+        SQADD   v17.8h, v17.8h, v2.8h
+        SQADD   v18.8h, v18.8h, v2.8h
+        SQADD   v19.8h, v19.8h, v2.8h
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
+        LD1R    {v0.16b}, [x11], 1      // clamp min value
+
+        SQXTN   v4.8b, v16.8h
+        SQXTN   v5.8b, v17.8h
+        SQXTN   v6.8b, v18.8h
+        SQXTN   v7.8b, v19.8h
+        LD1R    {v1.16b}, [x11]         // clamp max value
+        SQXTN2  v4.16b, v24.8h
+        SQXTN2  v5.16b, v25.8h
+        SQXTN2  v6.16b, v26.8h
+        SQXTN2  v7.16b, v27.8h
+        LDR     x12, [sp]               // cn_stride
+
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
+        SUBS    x1, x1, 16
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
+        B.LO    2f
+
+        # Store full 4 x 16
+        ST1     {v4.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v5.16b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v6.16b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v7.16b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Store odd width
+        .p2align 3
+2:
+        TBZ     x1, 3, 3f
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+        STR     d5, [x8], 8
+        DUP     d5, v5.d[1]
+        STR     d6, [x9], 8
+        DUP     d6, v6.d[1]
+        STR     d7, [x7], 8
+        DUP     d7, v7.d[1]
+3:
+        TBZ     x1, 2, 4f
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+        STR     s5, [x8], 4
+        DUP     s5, v5.s[1]
+        STR     s6, [x9], 4
+        DUP     s6, v6.s[1]
+        STR     s7, [x7], 4
+        DUP     s7, v7.s[1]
+4:
+        TBZ     x1, 1, 5f
+        ST1     {v4.h}[0], [x6], 2
+        DUP     h4, v4.h[1]
+        ST1     {v5.h}[0], [x8], 2
+        DUP     h5, v5.h[1]
+        ST1     {v6.h}[0], [x9], 2
+        DUP     h6, v6.h[1]
+        ST1     {v7.h}[0], [x7], 2
+        DUP     h7, v7.h[1]
+5:
+        TBZ     x1, 0, 6f
+        ST1     {v4.b}[0], [x6]
+        ST1     {v5.b}[0], [x8]
+        ST1     {v6.b}[0], [x9]
+        ST1     {v7.b}[0], [x7]
+6:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
new file mode 100644
index 0000000..4a4303d
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
@@ -0,0 +1,325 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v4  v5  v6  v7
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+        BIC     x2, x2, 3
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        LDR     x11, [sp, 8]            // params
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Is there at least 8 bytes?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
+        LDR     d0,  [x3], 8
+        LDR     q4,  [x5], 16
+        LDR     d1, [x15], 8
+        LDR     d2, [x13], 8
+        LDR     d3,  [x4], 8
+        LDR     q5,  [x5], 16
+        SDOT    v16.4s, v4.16b,  v0.4b[0]
+        SDOT    v17.4s, v4.16b,  v1.4b[0]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[0]
+        SDOT    v19.4s, v4.16b,  v3.4b[0]
+        SDOT    v20.4s, v5.16b,  v0.4b[0]
+        SDOT    v21.4s, v5.16b,  v1.4b[0]
+        SDOT    v22.4s, v5.16b,  v2.4b[0]
+        SDOT    v23.4s, v5.16b,  v3.4b[0]
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        LDP     q4, q5, [x5], 32
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+        SDOT    v16.4s, v4.16b,  v0.4b[1]
+        SDOT    v17.4s, v4.16b,  v1.4b[1]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[1]
+        SDOT    v19.4s, v4.16b,  v3.4b[1]
+        SDOT    v20.4s, v5.16b,  v0.4b[1]
+        SDOT    v21.4s, v5.16b,  v1.4b[1]
+        SDOT    v22.4s, v5.16b,  v2.4b[1]
+        SDOT    v23.4s, v5.16b,  v3.4b[1]
+        SDOT    v24.4s, v6.16b,  v0.4b[1]
+        SDOT    v25.4s, v6.16b,  v1.4b[1]
+        SDOT    v26.4s, v6.16b,  v2.4b[1]
+        SDOT    v27.4s, v6.16b,  v3.4b[1]
+        SDOT    v28.4s, v7.16b,  v0.4b[1]
+        SDOT    v29.4s, v7.16b,  v1.4b[1]
+        SDOT    v30.4s, v7.16b,  v2.4b[1]
+        SUBS    x0, x0, 8
+        SDOT    v31.4s, v7.16b,  v3.4b[1]
+        B.HS    1b
+
+        # Is there a remainder? - 4 bytes of A
+        TBNZ    x0, 2, 3f
+
+2:
+        # Apply params - scale, bias and clamp
+        LD1R    {v0.4s}, [x11], 4
+
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v16.4s, v0.4s, v16.4s
+        FMUL    v17.4s, v0.4s, v17.4s
+        FMUL    v18.4s, v0.4s, v18.4s
+        FMUL    v19.4s, v0.4s, v19.4s
+        FMUL    v20.4s, v0.4s, v20.4s
+        FMUL    v21.4s, v0.4s, v21.4s
+        FMUL    v22.4s, v0.4s, v22.4s
+        FMUL    v23.4s, v0.4s, v23.4s
+        FMUL    v24.4s, v0.4s, v24.4s
+        FMUL    v25.4s, v0.4s, v25.4s
+        FMUL    v26.4s, v0.4s, v26.4s
+        FMUL    v27.4s, v0.4s, v27.4s
+        FMUL    v28.4s, v0.4s, v28.4s
+        FMUL    v29.4s, v0.4s, v29.4s
+        FMUL    v30.4s, v0.4s, v30.4s
+        FMUL    v31.4s, v0.4s, v31.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v2.8h}, [x11], 2       // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v2.8h
+        SQADD   v17.8h, v17.8h, v2.8h
+        SQADD   v18.8h, v18.8h, v2.8h
+        SQADD   v19.8h, v19.8h, v2.8h
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
+        LD1R    {v0.16b}, [x11], 1      // clamp min value
+
+        SQXTN   v4.8b, v16.8h
+        SQXTN   v5.8b, v17.8h
+        SQXTN   v6.8b, v18.8h
+        SQXTN   v7.8b, v19.8h
+        LD1R    {v1.16b}, [x11]         // clamp max value
+        SQXTN2  v4.16b, v24.8h
+        SQXTN2  v5.16b, v25.8h
+        SQXTN2  v6.16b, v26.8h
+        SQXTN2  v7.16b, v27.8h
+        LDR     x12, [sp]               // cn_stride
+
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
+        SUBS    x1, x1, 16
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
+        B.LO    4f
+
+        # Store full 4 x 16
+        ST1     {v4.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v5.16b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v6.16b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v7.16b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder - 4 bytes of A
+        .p2align 3
+3:
+        LDR     s0,  [x3], 4
+        LDR     q4, [x5], 16
+        LDR     s1, [x15], 4
+        LDR     s2, [x13], 4
+        LDR     s3,  [x4], 4
+        SDOT    v16.4s, v4.16b,  v0.4b[0]
+        LDR     q5, [x5], 16
+        SDOT    v17.4s, v4.16b,  v1.4b[0]
+        SDOT    v18.4s, v4.16b,  v2.4b[0]
+        SDOT    v19.4s, v4.16b,  v3.4b[0]
+        SDOT    v20.4s, v5.16b,  v0.4b[0]
+        LDP     q6, q7, [x5], 32
+        SDOT    v21.4s, v5.16b,  v1.4b[0]
+        SDOT    v22.4s, v5.16b,  v2.4b[0]
+        SDOT    v23.4s, v5.16b,  v3.4b[0]
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ     x1, 3, 5f
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+        STR     d5, [x8], 8
+        DUP     d5, v5.d[1]
+        STR     d6, [x9], 8
+        DUP     d6, v6.d[1]
+        STR     d7, [x7], 8
+        DUP     d7, v7.d[1]
+5:
+        TBZ     x1, 2, 6f
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+        STR     s5, [x8], 4
+        DUP     s5, v5.s[1]
+        STR     s6, [x9], 4
+        DUP     s6, v6.s[1]
+        STR     s7, [x7], 4
+        DUP     s7, v7.s[1]
+6:
+        TBZ     x1, 1, 7f
+        ST1     {v4.h}[0], [x6], 2
+        DUP     h4, v4.h[1]
+        ST1     {v5.h}[0], [x8], 2
+        DUP     h5, v5.h[1]
+        ST1     {v6.h}[0], [x9], 2
+        DUP     h6, v6.h[1]
+        ST1     {v7.h}[0], [x7], 2
+        DUP     h7, v7.h[1]
+7:
+        TBZ     x1, 0, 8f
+        ST1     {v4.b}[0], [x6]
+        ST1     {v5.b}[0], [x8]
+        ST1     {v6.b}[0], [x9]
+        ST1     {v7.b}[0], [x7]
+8:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
index 09659d3..abcb262 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
@@ -9,6 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
+
 # void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
@@ -19,7 +20,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x12
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -69,7 +70,7 @@
         LDP     q24, q28, [x5], 32
         MOV     v19.16b, v16.16b
         MOV     v21.16b, v20.16b
-        LDR     x11, [sp, 40]           // params
+        LDR     x11, [sp, 40]           // reload params
         MOV     v22.16b, v20.16b
         MOV     v23.16b, v20.16b
         MOV     v25.16b, v24.16b
@@ -408,73 +409,70 @@
         .p2align 3
 3:
         # Apply params - scale, shift, bias and clamp
-        LD1R    {v0.4s}, [x11], 4
-        SQRDMULH v4.4s, v16.4s, v0.4s
-        SQRDMULH v5.4s, v17.4s, v0.4s
-        LD1R    {v1.4s}, [x11], 4
-        SQRDMULH v6.4s, v18.4s, v0.4s
-        SQRDMULH v7.4s, v19.4s, v0.4s
-        SQRDMULH v8.4s, v20.4s, v0.4s
-        SQRDMULH v9.4s, v21.4s, v0.4s
+        LD2R    {v0.4s, v1.4s}, [x11], 8
         CMEQ    v2.4s, v1.4s, 0
-        SQRDMULH v10.4s, v22.4s, v0.4s
-        SQRDMULH v11.4s, v23.4s, v0.4s
 
-        BIC     v16.16b, v16.16b, v2.16b
-        BIC     v17.16b, v17.16b, v2.16b
-        BIC     v18.16b, v18.16b, v2.16b
-        BIC     v19.16b, v19.16b, v2.16b
-        BIC     v20.16b, v20.16b, v2.16b
-        BIC     v21.16b, v21.16b, v2.16b
-        BIC     v22.16b, v22.16b, v2.16b
-        BIC     v23.16b, v23.16b, v2.16b
+        BIC     v4.16b, v16.16b, v2.16b
+        BIC     v5.16b, v17.16b, v2.16b
+        BIC     v6.16b, v18.16b, v2.16b
+        BIC     v7.16b, v19.16b, v2.16b
 
-        SSRA    v4.4s, v16.4s, 31       // signed shift right accumulate
-        SSRA    v5.4s, v17.4s, 31
-        SSRA    v6.4s, v18.4s, 31
-        SSRA    v7.4s, v19.4s, 31
-        SSRA    v8.4s, v20.4s, 31
-        SSRA    v9.4s, v21.4s, 31
-        SSRA    v10.4s, v22.4s, 31
-        SSRA    v11.4s, v23.4s, 31
+        SQRDMULH v16.4s, v16.4s, v0.4s
+        SQRDMULH v17.4s, v17.4s, v0.4s
+        SQRDMULH v18.4s, v18.4s, v0.4s
+        SQRDMULH v19.4s, v19.4s, v0.4s
 
-        SQRDMULH v16.4s, v24.4s, v0.4s
-        SQRDMULH v17.4s, v25.4s, v0.4s
-        SQRDMULH v18.4s, v26.4s, v0.4s
-        SQRDMULH v19.4s, v27.4s, v0.4s
-        SQRDMULH v20.4s, v28.4s, v0.4s
-        SQRDMULH v21.4s, v29.4s, v0.4s
-        SQRDMULH v22.4s, v30.4s, v0.4s
-        SQRDMULH v23.4s, v31.4s, v0.4s
+        SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
+        SSRA    v17.4s, v5.4s, 31
+        SSRA    v18.4s, v6.4s, 31
+        SSRA    v19.4s, v7.4s, 31
 
-        BIC     v24.16b, v24.16b, v2.16b
-        BIC     v25.16b, v25.16b, v2.16b
-        BIC     v26.16b, v26.16b, v2.16b
-        BIC     v27.16b, v27.16b, v2.16b
-        BIC     v28.16b, v28.16b, v2.16b
-        BIC     v29.16b, v29.16b, v2.16b
-        BIC     v30.16b, v30.16b, v2.16b
-        BIC     v31.16b, v31.16b, v2.16b
+        BIC     v4.16b, v20.16b, v2.16b
+        BIC     v5.16b, v21.16b, v2.16b
+        BIC     v6.16b, v22.16b, v2.16b
+        BIC     v7.16b, v23.16b, v2.16b
 
-        SSRA    v16.4s, v24.4s, 31
-        SSRA    v17.4s, v25.4s, 31
-        SSRA    v18.4s, v26.4s, 31
-        SSRA    v19.4s, v27.4s, 31
-        SSRA    v20.4s, v28.4s, 31
-        SSRA    v21.4s, v29.4s, 31
-        SSRA    v22.4s, v30.4s, 31
-        SSRA    v23.4s, v31.4s, 31
+        SQRDMULH v20.4s, v20.4s, v0.4s
+        SQRDMULH v21.4s, v21.4s, v0.4s
+        SQRDMULH v22.4s, v22.4s, v0.4s
+        SQRDMULH v23.4s, v23.4s, v0.4s
 
-        SRSHL   v4.4s,  v4.4s, v1.4s    // signed rounding shift left
-        SRSHL   v5.4s,  v5.4s, v1.4s
-        SRSHL   v6.4s,  v6.4s, v1.4s
-        SRSHL   v7.4s,  v7.4s, v1.4s
-        SRSHL   v8.4s,  v8.4s, v1.4s
-        SRSHL   v9.4s,  v9.4s, v1.4s
-        SRSHL   v10.4s, v10.4s, v1.4s
-        SRSHL   v11.4s, v11.4s, v1.4s
+        SSRA    v20.4s, v4.4s, 31
+        SSRA    v21.4s, v5.4s, 31
+        SSRA    v22.4s, v6.4s, 31
+        SSRA    v23.4s, v7.4s, 31
 
-        SRSHL   v16.4s, v16.4s, v1.4s
+        BIC     v4.16b, v24.16b, v2.16b
+        BIC     v5.16b, v25.16b, v2.16b
+        BIC     v6.16b, v26.16b, v2.16b
+        BIC     v7.16b, v27.16b, v2.16b
+
+        SQRDMULH v24.4s, v24.4s, v0.4s
+        SQRDMULH v25.4s, v25.4s, v0.4s
+        SQRDMULH v26.4s, v26.4s, v0.4s
+        SQRDMULH v27.4s, v27.4s, v0.4s
+
+        SSRA    v24.4s, v4.4s, 31
+        SSRA    v25.4s, v5.4s, 31
+        SSRA    v26.4s, v6.4s, 31
+        SSRA    v27.4s, v7.4s, 31
+
+        BIC     v4.16b, v28.16b, v2.16b
+        BIC     v5.16b, v29.16b, v2.16b
+        BIC     v6.16b, v30.16b, v2.16b
+        BIC     v7.16b, v31.16b, v2.16b
+
+        SQRDMULH v28.4s, v28.4s, v0.4s
+        SQRDMULH v29.4s, v29.4s, v0.4s
+        SQRDMULH v30.4s, v30.4s, v0.4s
+        SQRDMULH v31.4s, v31.4s, v0.4s
+
+        SSRA    v28.4s, v4.4s, 31
+        SSRA    v29.4s, v5.4s, 31
+        SSRA    v30.4s, v6.4s, 31
+        SSRA    v31.4s, v7.4s, 31
+
+        SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
         SRSHL   v17.4s, v17.4s, v1.4s
         SRSHL   v18.4s, v18.4s, v1.4s
         SRSHL   v19.4s, v19.4s, v1.4s
@@ -482,56 +480,64 @@
         SRSHL   v21.4s, v21.4s, v1.4s
         SRSHL   v22.4s, v22.4s, v1.4s
         SRSHL   v23.4s, v23.4s, v1.4s
+        SRSHL   v24.4s, v24.4s, v1.4s
+        SRSHL   v25.4s, v25.4s, v1.4s
+        SRSHL   v26.4s, v26.4s, v1.4s
+        SRSHL   v27.4s, v27.4s, v1.4s
+        SRSHL   v28.4s, v28.4s, v1.4s
+        SRSHL   v29.4s, v29.4s, v1.4s
+        SRSHL   v30.4s, v30.4s, v1.4s
+        SRSHL   v31.4s, v31.4s, v1.4s
 
-        SQXTN   v4.4h,  v4.4s
-        SQXTN   v5.4h,  v5.4s
-        SQXTN   v6.4h,  v6.4s
-        SQXTN   v7.4h,  v7.4s
         SQXTN   v16.4h, v16.4s
         SQXTN   v17.4h, v17.4s
         SQXTN   v18.4h, v18.4s
         SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
         LD1R    {v2.8h}, [x11], 2       // add bias
 
-        SQXTN2  v4.8h,  v8.4s
-        SQXTN2  v5.8h,  v9.4s
-        SQXTN2  v6.8h, v10.4s
-        SQXTN2  v7.8h, v11.4s
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
         SQXTN2  v18.8h, v22.4s
         SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
 
-        SQADD   v4.8h,  v4.8h, v2.8h
-        SQADD   v5.8h,  v5.8h, v2.8h
-        SQADD   v6.8h,  v6.8h, v2.8h
-        SQADD   v7.8h,  v7.8h, v2.8h
         SQADD   v16.8h, v16.8h, v2.8h
         SQADD   v17.8h, v17.8h, v2.8h
         SQADD   v18.8h, v18.8h, v2.8h
         SQADD   v19.8h, v19.8h, v2.8h
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
         LD1R    {v0.16b}, [x11], 1      // clamp min value
 
-        SQXTN   v4.8b,  v4.8h
-        SQXTN   v5.8b,  v5.8h
-        SQXTN   v6.8b,  v6.8h
-        SQXTN   v7.8b,  v7.8h
+        SQXTN   v4.8b, v16.8h
+        SQXTN   v5.8b, v17.8h
+        SQXTN   v6.8b, v18.8h
+        SQXTN   v7.8b, v19.8h
         LD1R    {v1.16b}, [x11]         // clamp max value
-        SQXTN2  v4.16b, v16.8h
-        SQXTN2  v5.16b, v17.8h
-        SQXTN2  v6.16b, v18.8h
-        SQXTN2  v7.16b, v19.8h
+        SQXTN2  v4.16b, v24.8h
+        SQXTN2  v5.16b, v25.8h
+        SQXTN2  v6.16b, v26.8h
+        SQXTN2  v7.16b, v27.8h
         LDR     x12, [sp, 32]           // cn_stride
 
-        SMAX    v4.16b,  v4.16b, v0.16b
-        SMAX    v5.16b,  v5.16b, v0.16b
-        SMAX    v6.16b,  v6.16b, v0.16b
-        SMAX    v7.16b,  v7.16b, v0.16b
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
         SUBS    x1, x1, 16
-        SMIN    v4.16b,  v4.16b, v1.16b
-        SMIN    v5.16b,  v5.16b, v1.16b
-        SMIN    v6.16b,  v6.16b, v1.16b
-        SMIN    v7.16b,  v7.16b, v1.16b
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
         B.LO    6f
 
         # Store full 4 x 16
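
For reference, the GEMMLOWP sequence above computes, per 32-bit lane, a rounding
doubling high multiply (SQRDMULH), a sign correction (CMEQ/BIC/SSRA), and a
rounding right shift (SRSHL). A scalar C sketch of that math follows; it is an
illustration rather than XNNPACK's scalar kernel: SQRDMULH saturation is elided,
and `shift` is assumed to be stored as a non-positive value so SRSHL shifts right.

    #include <stdint.h>

    static inline int32_t gemmlowp_requantize(int32_t acc, int32_t multiplier,
                                              int32_t shift /* <= 0 */) {
      // SQRDMULH: rounding doubling multiply, high 32 bits (saturation elided).
      const int64_t product = (int64_t) acc * multiplier;
      int32_t hi = (int32_t) ((product + (INT64_C(1) << 30)) >> 31);
      // CMEQ/BIC/SSRA: bias negative accumulators by -1 when a shift is applied,
      // so SRSHL's round-half-up behaves as round-half-away-from-zero.
      if (shift != 0 && acc < 0) hi -= 1;
      // SRSHL by a negative amount: rounding arithmetic shift right.
      const uint32_t n = (uint32_t) -shift;
      return n == 0 ? hi : (int32_t) (((int64_t) hi + (INT64_C(1) << (n - 1))) >> n);
    }
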
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
index e8a6d02..55ffe3f 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
+++ b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
@@ -22,7 +22,7 @@
 #     size_t cn_stride,          [sp] -> x12
 #     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
 
-# params structure is 11 bytes
+# params structure is 12 bytes
 #  struct {
 #    int32_t multiplier;
 #    int32_t right_shift;
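
The corrected size matches the bytes the kernels actually consume: 8 via LD2R
(multiplier and right_shift), 2 via LD1R.8h (output zero point), and 1 + 1 via
LD1R.16b (min, max), hence the 11-byte params rewind in the igemm variants. A
hypothetical C mirror of the packed layout, with field names inferred from the
load comments rather than taken from the header:

    #include <stdint.h>

    struct qs8_gemmlowp_params_sketch {   // 12 bytes, packed; names are assumptions
      int32_t multiplier;                 // LD2R {v0.4s, v1.4s}, [xN], 8
      int32_t right_shift;
      int16_t output_zero_point;          // LD1R {v2.8h}, [xN], 2   "add bias"
      int8_t  output_min;                 // LD1R {v0.16b}, [xN], 1  "clamp min value"
      int8_t  output_max;                 // LD1R {v1.16b}, [xN]     "clamp max value"
    };
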
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
index 75409e7..f4eac7d 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
+++ b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
@@ -9,6 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
+
 # void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
@@ -19,7 +20,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x12
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -67,7 +68,7 @@
         LDP     q24, q28, [x5], 32
         MOV     v19.16b, v16.16b
         MOV     v21.16b, v20.16b
-        LDR     x11, [sp, 8]            // params
+        LDR     x11, [sp, 8]            // reload params
         MOV     v22.16b, v20.16b
         MOV     v23.16b, v20.16b
         MOV     x0, x2                  // k = kc.  assumes kc > 0
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
index d5d5df8..4d482ac 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+++ b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -9,6 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
+
 # void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
@@ -19,7 +20,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x12
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
index 972e930..c6b61ef 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
@@ -5,7 +5,10 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -17,7 +20,7 @@
 #     size_t cn_stride,                  [sp] -> (x0)
 #     size_t a_offset,                   [sp + 8] -> x11
 #     const int8_t* zero,                [sp + 16] -> x12
-#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+#     const union ${CONV_PARAMS} params [sp + 24] -> (x8)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -35,7 +38,7 @@
 
 # x8 temp for Cortex-A55 loads
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
@@ -428,132 +431,194 @@
         SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
         B.HI    1b
 
-        # Apply params - scale, shift, bias and clamp
-        LD1R    {v0.4s}, [x8], 4
-        SQRDMULH v4.4s, v16.4s, v0.4s
-        SQRDMULH v5.4s, v17.4s, v0.4s
-        LD1R    {v1.4s}, [x8], 4
-        SQRDMULH v6.4s, v18.4s, v0.4s
-        SQRDMULH v7.4s, v19.4s, v0.4s
-        SQRDMULH v8.4s, v20.4s, v0.4s
-        SQRDMULH v9.4s, v21.4s, v0.4s
-        CMEQ    v2.4s, v1.4s, 0
-        SQRDMULH v10.4s, v22.4s, v0.4s
-        SQRDMULH v11.4s, v23.4s, v0.4s
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          LD2R    {v0.4s, v1.4s}, [x8], 8
+          CMEQ    v2.4s, v1.4s, 0
 
-        BIC     v16.16b, v16.16b, v2.16b
-        BIC     v17.16b, v17.16b, v2.16b
-        BIC     v18.16b, v18.16b, v2.16b
-        BIC     v19.16b, v19.16b, v2.16b
-        BIC     v20.16b, v20.16b, v2.16b
-        BIC     v21.16b, v21.16b, v2.16b
-        BIC     v22.16b, v22.16b, v2.16b
-        BIC     v23.16b, v23.16b, v2.16b
+          BIC     v4.16b, v16.16b, v2.16b
+          BIC     v5.16b, v17.16b, v2.16b
+          BIC     v6.16b, v18.16b, v2.16b
+          BIC     v7.16b, v19.16b, v2.16b
 
-        SSRA    v4.4s, v16.4s, 31       // signed shift right accumulate
-        SSRA    v5.4s, v17.4s, 31
-        SSRA    v6.4s, v18.4s, 31
-        SSRA    v7.4s, v19.4s, 31
-        SSRA    v8.4s, v20.4s, 31
-        SSRA    v9.4s, v21.4s, 31
-        SSRA    v10.4s, v22.4s, 31
-        SSRA    v11.4s, v23.4s, 31
+          SQRDMULH v16.4s, v16.4s, v0.4s
+          SQRDMULH v17.4s, v17.4s, v0.4s
+          SQRDMULH v18.4s, v18.4s, v0.4s
+          SQRDMULH v19.4s, v19.4s, v0.4s
 
-        SQRDMULH v16.4s, v24.4s, v0.4s
-        SQRDMULH v17.4s, v25.4s, v0.4s
-        SQRDMULH v18.4s, v26.4s, v0.4s
-        SQRDMULH v19.4s, v27.4s, v0.4s
-        SQRDMULH v20.4s, v28.4s, v0.4s
-        SQRDMULH v21.4s, v29.4s, v0.4s
-        SQRDMULH v22.4s, v30.4s, v0.4s
-        SQRDMULH v23.4s, v31.4s, v0.4s
+          SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
+          SSRA    v17.4s, v5.4s, 31
+          SSRA    v18.4s, v6.4s, 31
+          SSRA    v19.4s, v7.4s, 31
 
-        BIC     v24.16b, v24.16b, v2.16b
-        BIC     v25.16b, v25.16b, v2.16b
-        BIC     v26.16b, v26.16b, v2.16b
-        BIC     v27.16b, v27.16b, v2.16b
-        BIC     v28.16b, v28.16b, v2.16b
-        BIC     v29.16b, v29.16b, v2.16b
-        BIC     v30.16b, v30.16b, v2.16b
-        BIC     v31.16b, v31.16b, v2.16b
+          BIC     v4.16b, v20.16b, v2.16b
+          BIC     v5.16b, v21.16b, v2.16b
+          BIC     v6.16b, v22.16b, v2.16b
+          BIC     v7.16b, v23.16b, v2.16b
 
-        SSRA    v16.4s, v24.4s, 31
-        SSRA    v17.4s, v25.4s, 31
-        SSRA    v18.4s, v26.4s, 31
-        SSRA    v19.4s, v27.4s, 31
-        SSRA    v20.4s, v28.4s, 31
-        SSRA    v21.4s, v29.4s, 31
-        SSRA    v22.4s, v30.4s, 31
-        SSRA    v23.4s, v31.4s, 31
+          SQRDMULH v20.4s, v20.4s, v0.4s
+          SQRDMULH v21.4s, v21.4s, v0.4s
+          SQRDMULH v22.4s, v22.4s, v0.4s
+          SQRDMULH v23.4s, v23.4s, v0.4s
 
-        SRSHL   v4.4s,  v4.4s, v1.4s    // signed rounding shift left
-        SRSHL   v5.4s,  v5.4s, v1.4s
-        SRSHL   v6.4s,  v6.4s, v1.4s
-        SRSHL   v7.4s,  v7.4s, v1.4s
-        SRSHL   v8.4s,  v8.4s, v1.4s
-        SRSHL   v9.4s,  v9.4s, v1.4s
-        SRSHL   v10.4s, v10.4s, v1.4s
-        SRSHL   v11.4s, v11.4s, v1.4s
+          SSRA    v20.4s, v4.4s, 31
+          SSRA    v21.4s, v5.4s, 31
+          SSRA    v22.4s, v6.4s, 31
+          SSRA    v23.4s, v7.4s, 31
 
-        SRSHL   v16.4s, v16.4s, v1.4s
-        SRSHL   v17.4s, v17.4s, v1.4s
-        SRSHL   v18.4s, v18.4s, v1.4s
-        SRSHL   v19.4s, v19.4s, v1.4s
-        SRSHL   v20.4s, v20.4s, v1.4s
-        SRSHL   v21.4s, v21.4s, v1.4s
-        SRSHL   v22.4s, v22.4s, v1.4s
-        SRSHL   v23.4s, v23.4s, v1.4s
+          BIC     v4.16b, v24.16b, v2.16b
+          BIC     v5.16b, v25.16b, v2.16b
+          BIC     v6.16b, v26.16b, v2.16b
+          BIC     v7.16b, v27.16b, v2.16b
 
-        SQXTN   v4.4h,  v4.4s
-        SQXTN   v5.4h,  v5.4s
-        SQXTN   v6.4h,  v6.4s
-        SQXTN   v7.4h,  v7.4s
+          SQRDMULH v24.4s, v24.4s, v0.4s
+          SQRDMULH v25.4s, v25.4s, v0.4s
+          SQRDMULH v26.4s, v26.4s, v0.4s
+          SQRDMULH v27.4s, v27.4s, v0.4s
+
+          SSRA    v24.4s, v4.4s, 31
+          SSRA    v25.4s, v5.4s, 31
+          SSRA    v26.4s, v6.4s, 31
+          SSRA    v27.4s, v7.4s, 31
+
+          BIC     v4.16b, v28.16b, v2.16b
+          BIC     v5.16b, v29.16b, v2.16b
+          BIC     v6.16b, v30.16b, v2.16b
+          BIC     v7.16b, v31.16b, v2.16b
+
+          SQRDMULH v28.4s, v28.4s, v0.4s
+          SQRDMULH v29.4s, v29.4s, v0.4s
+          SQRDMULH v30.4s, v30.4s, v0.4s
+          SQRDMULH v31.4s, v31.4s, v0.4s
+
+          SSRA    v28.4s, v4.4s, 31
+          SSRA    v29.4s, v5.4s, 31
+          SSRA    v30.4s, v6.4s, 31
+          SSRA    v31.4s, v7.4s, 31
+
+          SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
+          SRSHL   v17.4s, v17.4s, v1.4s
+          SRSHL   v18.4s, v18.4s, v1.4s
+          SRSHL   v19.4s, v19.4s, v1.4s
+          SRSHL   v20.4s, v20.4s, v1.4s
+          SRSHL   v21.4s, v21.4s, v1.4s
+          SRSHL   v22.4s, v22.4s, v1.4s
+          SRSHL   v23.4s, v23.4s, v1.4s
+          SRSHL   v24.4s, v24.4s, v1.4s
+          SRSHL   v25.4s, v25.4s, v1.4s
+          SRSHL   v26.4s, v26.4s, v1.4s
+          SRSHL   v27.4s, v27.4s, v1.4s
+          SRSHL   v28.4s, v28.4s, v1.4s
+          SRSHL   v29.4s, v29.4s, v1.4s
+          SRSHL   v30.4s, v30.4s, v1.4s
+          SRSHL   v31.4s, v31.4s, v1.4s
+        $elif REQUANTIZATION == "FP32":
+          # Apply params - scale, bias and clamp
+          LD1R    {v0.4s}, [x8], 4
+
+          SCVTF   v16.4s, v16.4s
+          SCVTF   v17.4s, v17.4s
+          SCVTF   v18.4s, v18.4s
+          SCVTF   v19.4s, v19.4s
+          SCVTF   v20.4s, v20.4s
+          SCVTF   v21.4s, v21.4s
+          SCVTF   v22.4s, v22.4s
+          SCVTF   v23.4s, v23.4s
+          SCVTF   v24.4s, v24.4s
+          SCVTF   v25.4s, v25.4s
+          SCVTF   v26.4s, v26.4s
+          SCVTF   v27.4s, v27.4s
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          FMUL    v16.4s, v0.4s, v16.4s
+          FMUL    v17.4s, v0.4s, v17.4s
+          FMUL    v18.4s, v0.4s, v18.4s
+          FMUL    v19.4s, v0.4s, v19.4s
+          FMUL    v20.4s, v0.4s, v20.4s
+          FMUL    v21.4s, v0.4s, v21.4s
+          FMUL    v22.4s, v0.4s, v22.4s
+          FMUL    v23.4s, v0.4s, v23.4s
+          FMUL    v24.4s, v0.4s, v24.4s
+          FMUL    v25.4s, v0.4s, v25.4s
+          FMUL    v26.4s, v0.4s, v26.4s
+          FMUL    v27.4s, v0.4s, v27.4s
+          FMUL    v28.4s, v0.4s, v28.4s
+          FMUL    v29.4s, v0.4s, v29.4s
+          FMUL    v30.4s, v0.4s, v30.4s
+          FMUL    v31.4s, v0.4s, v31.4s
+
+          FCVTNS  v16.4s, v16.4s
+          FCVTNS  v17.4s, v17.4s
+          FCVTNS  v18.4s, v18.4s
+          FCVTNS  v19.4s, v19.4s
+          FCVTNS  v20.4s, v20.4s
+          FCVTNS  v21.4s, v21.4s
+          FCVTNS  v22.4s, v22.4s
+          FCVTNS  v23.4s, v23.4s
+          FCVTNS  v24.4s, v24.4s
+          FCVTNS  v25.4s, v25.4s
+          FCVTNS  v26.4s, v26.4s
+          FCVTNS  v27.4s, v27.4s
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
+
         SQXTN   v16.4h, v16.4s
         SQXTN   v17.4h, v17.4s
         SQXTN   v18.4h, v18.4s
         SQXTN   v19.4h, v19.4s
-        LD1R    {v2.8h}, [x8], 2        // add bias
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v2.8h}, [x8], 2       // add bias
 
-        SQXTN2  v4.8h,  v8.4s
-        SQXTN2  v5.8h,  v9.4s
-        SQXTN2  v6.8h, v10.4s
-        SQXTN2  v7.8h, v11.4s
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
         SQXTN2  v18.8h, v22.4s
         SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
 
-        SQADD   v4.8h,  v4.8h, v2.8h
-        SQADD   v5.8h,  v5.8h, v2.8h
-        SQADD   v6.8h,  v6.8h, v2.8h
-        SQADD   v7.8h,  v7.8h, v2.8h
         SQADD   v16.8h, v16.8h, v2.8h
         SQADD   v17.8h, v17.8h, v2.8h
         SQADD   v18.8h, v18.8h, v2.8h
         SQADD   v19.8h, v19.8h, v2.8h
-        LD1R    {v0.16b}, [x8], 1       // clamp min value
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
+        LD1R    {v0.16b}, [x8], 1      // clamp min value
 
-        SQXTN   v4.8b,  v4.8h
-        SQXTN   v5.8b,  v5.8h
-        SQXTN   v6.8b,  v6.8h
-        SQXTN   v7.8b,  v7.8h
-        LD1R    {v1.16b}, [x8]          // clamp max value
-        SQXTN2  v4.16b, v16.8h
-        SQXTN2  v5.16b, v17.8h
-        SQXTN2  v6.16b, v18.8h
-        SQXTN2  v7.16b, v19.8h
-        SUB     x8, x8, 11              // rewind params pointer
-
-        SMAX    v4.16b,  v4.16b, v0.16b
-        SMAX    v5.16b,  v5.16b, v0.16b
-        LDR     x0, [sp, 32]            // Load cn_stride
-        SMAX    v6.16b,  v6.16b, v0.16b
-        SMAX    v7.16b,  v7.16b, v0.16b
+        SQXTN   v4.8b, v16.8h
+        SQXTN   v5.8b, v17.8h
+        SQXTN   v6.8b, v18.8h
+        SQXTN   v7.8b, v19.8h
+        LD1R    {v1.16b}, [x8]         // clamp max value
+        SQXTN2  v4.16b, v24.8h
+        SQXTN2  v5.16b, v25.8h
+        SQXTN2  v6.16b, v26.8h
+        SQXTN2  v7.16b, v27.8h
+        $if REQUANTIZATION == "GEMMLOWP":
+          SUB     x8, x8, 11              // rewind params pointer
+        $elif REQUANTIZATION == "FP32":
+          SUB     x8, x8, 7               // rewind params pointer
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        LDR     x0, [sp, 32]           // cn_stride
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
         SUBS    x1, x1, 16
-        SMIN    v4.16b,  v4.16b, v1.16b
-        SMIN    v5.16b,  v5.16b, v1.16b
-        SMIN    v6.16b,  v6.16b, v1.16b
-        SMIN    v7.16b,  v7.16b, v1.16b
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
         B.LO    7f
 
         # Store full 4 x 16
@@ -686,7 +751,7 @@
         LDP     d8,  d9, [sp], 32
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+END_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in b/src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
index 879d8e6..a8f8577 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
@@ -5,7 +5,10 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -17,7 +20,7 @@
 #     size_t cn_stride,                  [sp] -> x10
 #     size_t a_offset,                   [sp + 8] -> x11
 #     const int8_t* zero,                [sp + 16] -> x12
-#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+#     const union ${CONV_PARAMS} params [sp + 24] -> (x8)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -33,7 +36,7 @@
 # C3   x7 v19 v23 v27 v31
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
@@ -191,86 +194,142 @@
 
         LDR     x8, [sp, 24]            // reload params pointer
 
-        # Apply params - scale, shift, bias and clamp
-        LD2R    {v0.4s, v1.4s}, [x8], 8
-        CMEQ    v2.4s, v1.4s, 0
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          LD2R    {v0.4s, v1.4s}, [x8], 8
+          CMEQ    v2.4s, v1.4s, 0
 
-        BIC     v4.16b, v16.16b, v2.16b
-        BIC     v5.16b, v17.16b, v2.16b
-        BIC     v6.16b, v18.16b, v2.16b
-        BIC     v7.16b, v19.16b, v2.16b
+          BIC     v4.16b, v16.16b, v2.16b
+          BIC     v5.16b, v17.16b, v2.16b
+          BIC     v6.16b, v18.16b, v2.16b
+          BIC     v7.16b, v19.16b, v2.16b
 
-        SQRDMULH v16.4s, v16.4s, v0.4s
-        SQRDMULH v17.4s, v17.4s, v0.4s
-        SQRDMULH v18.4s, v18.4s, v0.4s
-        SQRDMULH v19.4s, v19.4s, v0.4s
+          SQRDMULH v16.4s, v16.4s, v0.4s
+          SQRDMULH v17.4s, v17.4s, v0.4s
+          SQRDMULH v18.4s, v18.4s, v0.4s
+          SQRDMULH v19.4s, v19.4s, v0.4s
 
-        SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
-        SSRA    v17.4s, v5.4s, 31
-        SSRA    v18.4s, v6.4s, 31
-        SSRA    v19.4s, v7.4s, 31
+          SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
+          SSRA    v17.4s, v5.4s, 31
+          SSRA    v18.4s, v6.4s, 31
+          SSRA    v19.4s, v7.4s, 31
 
-        BIC     v4.16b, v20.16b, v2.16b
-        BIC     v5.16b, v21.16b, v2.16b
-        BIC     v6.16b, v22.16b, v2.16b
-        BIC     v7.16b, v23.16b, v2.16b
+          BIC     v4.16b, v20.16b, v2.16b
+          BIC     v5.16b, v21.16b, v2.16b
+          BIC     v6.16b, v22.16b, v2.16b
+          BIC     v7.16b, v23.16b, v2.16b
 
-        SQRDMULH v20.4s, v20.4s, v0.4s
-        SQRDMULH v21.4s, v21.4s, v0.4s
-        SQRDMULH v22.4s, v22.4s, v0.4s
-        SQRDMULH v23.4s, v23.4s, v0.4s
+          SQRDMULH v20.4s, v20.4s, v0.4s
+          SQRDMULH v21.4s, v21.4s, v0.4s
+          SQRDMULH v22.4s, v22.4s, v0.4s
+          SQRDMULH v23.4s, v23.4s, v0.4s
 
-        SSRA    v20.4s, v4.4s, 31
-        SSRA    v21.4s, v5.4s, 31
-        SSRA    v22.4s, v6.4s, 31
-        SSRA    v23.4s, v7.4s, 31
+          SSRA    v20.4s, v4.4s, 31
+          SSRA    v21.4s, v5.4s, 31
+          SSRA    v22.4s, v6.4s, 31
+          SSRA    v23.4s, v7.4s, 31
 
-        BIC     v4.16b, v24.16b, v2.16b
-        BIC     v5.16b, v25.16b, v2.16b
-        BIC     v6.16b, v26.16b, v2.16b
-        BIC     v7.16b, v27.16b, v2.16b
+          BIC     v4.16b, v24.16b, v2.16b
+          BIC     v5.16b, v25.16b, v2.16b
+          BIC     v6.16b, v26.16b, v2.16b
+          BIC     v7.16b, v27.16b, v2.16b
 
-        SQRDMULH v24.4s, v24.4s, v0.4s
-        SQRDMULH v25.4s, v25.4s, v0.4s
-        SQRDMULH v26.4s, v26.4s, v0.4s
-        SQRDMULH v27.4s, v27.4s, v0.4s
+          SQRDMULH v24.4s, v24.4s, v0.4s
+          SQRDMULH v25.4s, v25.4s, v0.4s
+          SQRDMULH v26.4s, v26.4s, v0.4s
+          SQRDMULH v27.4s, v27.4s, v0.4s
 
-        SSRA    v24.4s, v4.4s, 31
-        SSRA    v25.4s, v5.4s, 31
-        SSRA    v26.4s, v6.4s, 31
-        SSRA    v27.4s, v7.4s, 31
+          SSRA    v24.4s, v4.4s, 31
+          SSRA    v25.4s, v5.4s, 31
+          SSRA    v26.4s, v6.4s, 31
+          SSRA    v27.4s, v7.4s, 31
 
-        BIC     v4.16b, v28.16b, v2.16b
-        BIC     v5.16b, v29.16b, v2.16b
-        BIC     v6.16b, v30.16b, v2.16b
-        BIC     v7.16b, v31.16b, v2.16b
+          BIC     v4.16b, v28.16b, v2.16b
+          BIC     v5.16b, v29.16b, v2.16b
+          BIC     v6.16b, v30.16b, v2.16b
+          BIC     v7.16b, v31.16b, v2.16b
 
-        SQRDMULH v28.4s, v28.4s, v0.4s
-        SQRDMULH v29.4s, v29.4s, v0.4s
-        SQRDMULH v30.4s, v30.4s, v0.4s
-        SQRDMULH v31.4s, v31.4s, v0.4s
+          SQRDMULH v28.4s, v28.4s, v0.4s
+          SQRDMULH v29.4s, v29.4s, v0.4s
+          SQRDMULH v30.4s, v30.4s, v0.4s
+          SQRDMULH v31.4s, v31.4s, v0.4s
 
-        SSRA    v28.4s, v4.4s, 31
-        SSRA    v29.4s, v5.4s, 31
-        SSRA    v30.4s, v6.4s, 31
-        SSRA    v31.4s, v7.4s, 31
+          SSRA    v28.4s, v4.4s, 31
+          SSRA    v29.4s, v5.4s, 31
+          SSRA    v30.4s, v6.4s, 31
+          SSRA    v31.4s, v7.4s, 31
 
-        SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
-        SRSHL   v17.4s, v17.4s, v1.4s
-        SRSHL   v18.4s, v18.4s, v1.4s
-        SRSHL   v19.4s, v19.4s, v1.4s
-        SRSHL   v20.4s, v20.4s, v1.4s
-        SRSHL   v21.4s, v21.4s, v1.4s
-        SRSHL   v22.4s, v22.4s, v1.4s
-        SRSHL   v23.4s, v23.4s, v1.4s
-        SRSHL   v24.4s, v24.4s, v1.4s
-        SRSHL   v25.4s, v25.4s, v1.4s
-        SRSHL   v26.4s, v26.4s, v1.4s
-        SRSHL   v27.4s, v27.4s, v1.4s
-        SRSHL   v28.4s, v28.4s, v1.4s
-        SRSHL   v29.4s, v29.4s, v1.4s
-        SRSHL   v30.4s, v30.4s, v1.4s
-        SRSHL   v31.4s, v31.4s, v1.4s
+          SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
+          SRSHL   v17.4s, v17.4s, v1.4s
+          SRSHL   v18.4s, v18.4s, v1.4s
+          SRSHL   v19.4s, v19.4s, v1.4s
+          SRSHL   v20.4s, v20.4s, v1.4s
+          SRSHL   v21.4s, v21.4s, v1.4s
+          SRSHL   v22.4s, v22.4s, v1.4s
+          SRSHL   v23.4s, v23.4s, v1.4s
+          SRSHL   v24.4s, v24.4s, v1.4s
+          SRSHL   v25.4s, v25.4s, v1.4s
+          SRSHL   v26.4s, v26.4s, v1.4s
+          SRSHL   v27.4s, v27.4s, v1.4s
+          SRSHL   v28.4s, v28.4s, v1.4s
+          SRSHL   v29.4s, v29.4s, v1.4s
+          SRSHL   v30.4s, v30.4s, v1.4s
+          SRSHL   v31.4s, v31.4s, v1.4s
+        $elif REQUANTIZATION == "FP32":
+          # Apply params - scale, bias and clamp
+          LD1R    {v0.4s}, [x8], 4
+
+          SCVTF   v16.4s, v16.4s
+          SCVTF   v17.4s, v17.4s
+          SCVTF   v18.4s, v18.4s
+          SCVTF   v19.4s, v19.4s
+          SCVTF   v20.4s, v20.4s
+          SCVTF   v21.4s, v21.4s
+          SCVTF   v22.4s, v22.4s
+          SCVTF   v23.4s, v23.4s
+          SCVTF   v24.4s, v24.4s
+          SCVTF   v25.4s, v25.4s
+          SCVTF   v26.4s, v26.4s
+          SCVTF   v27.4s, v27.4s
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          FMUL    v16.4s, v0.4s, v16.4s
+          FMUL    v17.4s, v0.4s, v17.4s
+          FMUL    v18.4s, v0.4s, v18.4s
+          FMUL    v19.4s, v0.4s, v19.4s
+          FMUL    v20.4s, v0.4s, v20.4s
+          FMUL    v21.4s, v0.4s, v21.4s
+          FMUL    v22.4s, v0.4s, v22.4s
+          FMUL    v23.4s, v0.4s, v23.4s
+          FMUL    v24.4s, v0.4s, v24.4s
+          FMUL    v25.4s, v0.4s, v25.4s
+          FMUL    v26.4s, v0.4s, v26.4s
+          FMUL    v27.4s, v0.4s, v27.4s
+          FMUL    v28.4s, v0.4s, v28.4s
+          FMUL    v29.4s, v0.4s, v29.4s
+          FMUL    v30.4s, v0.4s, v30.4s
+          FMUL    v31.4s, v0.4s, v31.4s
+
+          FCVTNS  v16.4s, v16.4s
+          FCVTNS  v17.4s, v17.4s
+          FCVTNS  v18.4s, v18.4s
+          FCVTNS  v19.4s, v19.4s
+          FCVTNS  v20.4s, v20.4s
+          FCVTNS  v21.4s, v21.4s
+          FCVTNS  v22.4s, v22.4s
+          FCVTNS  v23.4s, v23.4s
+          FCVTNS  v24.4s, v24.4s
+          FCVTNS  v25.4s, v25.4s
+          FCVTNS  v26.4s, v26.4s
+          FCVTNS  v27.4s, v27.4s
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
         SQXTN   v16.4h, v16.4s
         SQXTN   v17.4h, v17.4s
@@ -280,7 +339,7 @@
         SQXTN   v25.4h, v25.4s
         SQXTN   v26.4h, v26.4s
         SQXTN   v27.4h, v27.4s
-        LD1R    {v2.8h}, [x8], 2        // add bias
+        LD1R    {v2.8h}, [x8], 2       // add bias
 
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
@@ -299,13 +358,13 @@
         SQADD   v25.8h, v25.8h, v2.8h
         SQADD   v26.8h, v26.8h, v2.8h
         SQADD   v27.8h, v27.8h, v2.8h
-        LD1R    {v0.16b}, [x8], 1       // clamp min value
+        LD1R    {v0.16b}, [x8], 1      // clamp min value
 
         SQXTN   v4.8b, v16.8h
         SQXTN   v5.8b, v17.8h
         SQXTN   v6.8b, v18.8h
         SQXTN   v7.8b, v19.8h
-        LD1R    {v1.16b}, [x8]          // clamp max value
+        LD1R    {v1.16b}, [x8]         // clamp max value
         SQXTN2  v4.16b, v24.8h
         SQXTN2  v5.16b, v25.8h
         SQXTN2  v6.16b, v26.8h
@@ -454,7 +513,7 @@
 10:
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
+END_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
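
The FP32 branch added to these templates trades the fixed-point multiply and
shift for a float round trip: SCVTF converts the accumulator, FMUL applies the
scale, and FCVTNS rounds back to int32 to nearest with ties to even. A scalar
sketch of the per-lane math, where lrintf stands in for FCVTNS under the
default round-to-nearest floating-point environment:

    #include <math.h>
    #include <stdint.h>

    static inline int32_t fp32_requantize(int32_t acc, float scale) {
      const float scaled = (float) acc * scale;  // SCVTF + FMUL
      return (int32_t) lrintf(scaled);           // FCVTNS: nearest, ties to even
    }
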
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in b/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
index b463c7f..44e95a5 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
@@ -5,7 +5,10 @@
 
 #include <xnnpack/assembly.h>
 
-# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
 #     size_t kc,                 x2 / x0
@@ -17,7 +20,7 @@
 #     size_t cn_stride,                  [sp] -> x10
 #     size_t a_offset,                   [sp + 8] -> x11
 #     const int8_t* zero,                [sp + 16] -> x12
-#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+#     const union ${CONV_PARAMS} params [sp + 24] -> (x8)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -33,7 +36,7 @@
 # C3   x7 v19 v23 v27 v31
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64
 
         # Clamp C pointers
         CMP     x0, 2                   // if mr < 2
@@ -151,86 +154,141 @@
         B.HI    1b
 
 3:
-        # Apply params - scale, shift, bias and clamp
-        LD2R    {v0.4s, v1.4s}, [x8], 8
-        CMEQ    v2.4s, v1.4s, 0
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          LD2R    {v0.4s, v1.4s}, [x8], 8
+          CMEQ    v2.4s, v1.4s, 0
 
-        BIC     v4.16b, v16.16b, v2.16b
-        BIC     v5.16b, v17.16b, v2.16b
-        BIC     v6.16b, v18.16b, v2.16b
-        BIC     v7.16b, v19.16b, v2.16b
+          BIC     v4.16b, v16.16b, v2.16b
+          BIC     v5.16b, v17.16b, v2.16b
+          BIC     v6.16b, v18.16b, v2.16b
+          BIC     v7.16b, v19.16b, v2.16b
 
-        SQRDMULH v16.4s, v16.4s, v0.4s
-        SQRDMULH v17.4s, v17.4s, v0.4s
-        SQRDMULH v18.4s, v18.4s, v0.4s
-        SQRDMULH v19.4s, v19.4s, v0.4s
+          SQRDMULH v16.4s, v16.4s, v0.4s
+          SQRDMULH v17.4s, v17.4s, v0.4s
+          SQRDMULH v18.4s, v18.4s, v0.4s
+          SQRDMULH v19.4s, v19.4s, v0.4s
 
-        SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
-        SSRA    v17.4s, v5.4s, 31
-        SSRA    v18.4s, v6.4s, 31
-        SSRA    v19.4s, v7.4s, 31
+          SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
+          SSRA    v17.4s, v5.4s, 31
+          SSRA    v18.4s, v6.4s, 31
+          SSRA    v19.4s, v7.4s, 31
 
-        BIC     v4.16b, v20.16b, v2.16b
-        BIC     v5.16b, v21.16b, v2.16b
-        BIC     v6.16b, v22.16b, v2.16b
-        BIC     v7.16b, v23.16b, v2.16b
+          BIC     v4.16b, v20.16b, v2.16b
+          BIC     v5.16b, v21.16b, v2.16b
+          BIC     v6.16b, v22.16b, v2.16b
+          BIC     v7.16b, v23.16b, v2.16b
 
-        SQRDMULH v20.4s, v20.4s, v0.4s
-        SQRDMULH v21.4s, v21.4s, v0.4s
-        SQRDMULH v22.4s, v22.4s, v0.4s
-        SQRDMULH v23.4s, v23.4s, v0.4s
+          SQRDMULH v20.4s, v20.4s, v0.4s
+          SQRDMULH v21.4s, v21.4s, v0.4s
+          SQRDMULH v22.4s, v22.4s, v0.4s
+          SQRDMULH v23.4s, v23.4s, v0.4s
 
-        SSRA    v20.4s, v4.4s, 31
-        SSRA    v21.4s, v5.4s, 31
-        SSRA    v22.4s, v6.4s, 31
-        SSRA    v23.4s, v7.4s, 31
+          SSRA    v20.4s, v4.4s, 31
+          SSRA    v21.4s, v5.4s, 31
+          SSRA    v22.4s, v6.4s, 31
+          SSRA    v23.4s, v7.4s, 31
 
-        BIC     v4.16b, v24.16b, v2.16b
-        BIC     v5.16b, v25.16b, v2.16b
-        BIC     v6.16b, v26.16b, v2.16b
-        BIC     v7.16b, v27.16b, v2.16b
+          BIC     v4.16b, v24.16b, v2.16b
+          BIC     v5.16b, v25.16b, v2.16b
+          BIC     v6.16b, v26.16b, v2.16b
+          BIC     v7.16b, v27.16b, v2.16b
 
-        SQRDMULH v24.4s, v24.4s, v0.4s
-        SQRDMULH v25.4s, v25.4s, v0.4s
-        SQRDMULH v26.4s, v26.4s, v0.4s
-        SQRDMULH v27.4s, v27.4s, v0.4s
+          SQRDMULH v24.4s, v24.4s, v0.4s
+          SQRDMULH v25.4s, v25.4s, v0.4s
+          SQRDMULH v26.4s, v26.4s, v0.4s
+          SQRDMULH v27.4s, v27.4s, v0.4s
 
-        SSRA    v24.4s, v4.4s, 31
-        SSRA    v25.4s, v5.4s, 31
-        SSRA    v26.4s, v6.4s, 31
-        SSRA    v27.4s, v7.4s, 31
+          SSRA    v24.4s, v4.4s, 31
+          SSRA    v25.4s, v5.4s, 31
+          SSRA    v26.4s, v6.4s, 31
+          SSRA    v27.4s, v7.4s, 31
 
-        BIC     v4.16b, v28.16b, v2.16b
-        BIC     v5.16b, v29.16b, v2.16b
-        BIC     v6.16b, v30.16b, v2.16b
-        BIC     v7.16b, v31.16b, v2.16b
+          BIC     v4.16b, v28.16b, v2.16b
+          BIC     v5.16b, v29.16b, v2.16b
+          BIC     v6.16b, v30.16b, v2.16b
+          BIC     v7.16b, v31.16b, v2.16b
 
-        SQRDMULH v28.4s, v28.4s, v0.4s
-        SQRDMULH v29.4s, v29.4s, v0.4s
-        SQRDMULH v30.4s, v30.4s, v0.4s
-        SQRDMULH v31.4s, v31.4s, v0.4s
+          SQRDMULH v28.4s, v28.4s, v0.4s
+          SQRDMULH v29.4s, v29.4s, v0.4s
+          SQRDMULH v30.4s, v30.4s, v0.4s
+          SQRDMULH v31.4s, v31.4s, v0.4s
 
-        SSRA    v28.4s, v4.4s, 31
-        SSRA    v29.4s, v5.4s, 31
-        SSRA    v30.4s, v6.4s, 31
-        SSRA    v31.4s, v7.4s, 31
+          SSRA    v28.4s, v4.4s, 31
+          SSRA    v29.4s, v5.4s, 31
+          SSRA    v30.4s, v6.4s, 31
+          SSRA    v31.4s, v7.4s, 31
 
-        SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
-        SRSHL   v17.4s, v17.4s, v1.4s
-        SRSHL   v18.4s, v18.4s, v1.4s
-        SRSHL   v19.4s, v19.4s, v1.4s
-        SRSHL   v20.4s, v20.4s, v1.4s
-        SRSHL   v21.4s, v21.4s, v1.4s
-        SRSHL   v22.4s, v22.4s, v1.4s
-        SRSHL   v23.4s, v23.4s, v1.4s
-        SRSHL   v24.4s, v24.4s, v1.4s
-        SRSHL   v25.4s, v25.4s, v1.4s
-        SRSHL   v26.4s, v26.4s, v1.4s
-        SRSHL   v27.4s, v27.4s, v1.4s
-        SRSHL   v28.4s, v28.4s, v1.4s
-        SRSHL   v29.4s, v29.4s, v1.4s
-        SRSHL   v30.4s, v30.4s, v1.4s
-        SRSHL   v31.4s, v31.4s, v1.4s
+          SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
+          SRSHL   v17.4s, v17.4s, v1.4s
+          SRSHL   v18.4s, v18.4s, v1.4s
+          SRSHL   v19.4s, v19.4s, v1.4s
+          SRSHL   v20.4s, v20.4s, v1.4s
+          SRSHL   v21.4s, v21.4s, v1.4s
+          SRSHL   v22.4s, v22.4s, v1.4s
+          SRSHL   v23.4s, v23.4s, v1.4s
+          SRSHL   v24.4s, v24.4s, v1.4s
+          SRSHL   v25.4s, v25.4s, v1.4s
+          SRSHL   v26.4s, v26.4s, v1.4s
+          SRSHL   v27.4s, v27.4s, v1.4s
+          SRSHL   v28.4s, v28.4s, v1.4s
+          SRSHL   v29.4s, v29.4s, v1.4s
+          SRSHL   v30.4s, v30.4s, v1.4s
+          SRSHL   v31.4s, v31.4s, v1.4s
+        $elif REQUANTIZATION == "FP32":
+          # Apply params - scale, bias and clamp
+          LD1R    {v0.4s}, [x8], 4
+
+          SCVTF   v16.4s, v16.4s
+          SCVTF   v17.4s, v17.4s
+          SCVTF   v18.4s, v18.4s
+          SCVTF   v19.4s, v19.4s
+          SCVTF   v20.4s, v20.4s
+          SCVTF   v21.4s, v21.4s
+          SCVTF   v22.4s, v22.4s
+          SCVTF   v23.4s, v23.4s
+          SCVTF   v24.4s, v24.4s
+          SCVTF   v25.4s, v25.4s
+          SCVTF   v26.4s, v26.4s
+          SCVTF   v27.4s, v27.4s
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          FMUL    v16.4s, v0.4s, v16.4s
+          FMUL    v17.4s, v0.4s, v17.4s
+          FMUL    v18.4s, v0.4s, v18.4s
+          FMUL    v19.4s, v0.4s, v19.4s
+          FMUL    v20.4s, v0.4s, v20.4s
+          FMUL    v21.4s, v0.4s, v21.4s
+          FMUL    v22.4s, v0.4s, v22.4s
+          FMUL    v23.4s, v0.4s, v23.4s
+          FMUL    v24.4s, v0.4s, v24.4s
+          FMUL    v25.4s, v0.4s, v25.4s
+          FMUL    v26.4s, v0.4s, v26.4s
+          FMUL    v27.4s, v0.4s, v27.4s
+          FMUL    v28.4s, v0.4s, v28.4s
+          FMUL    v29.4s, v0.4s, v29.4s
+          FMUL    v30.4s, v0.4s, v30.4s
+          FMUL    v31.4s, v0.4s, v31.4s
+
+          FCVTNS  v16.4s, v16.4s
+          FCVTNS  v17.4s, v17.4s
+          FCVTNS  v18.4s, v18.4s
+          FCVTNS  v19.4s, v19.4s
+          FCVTNS  v20.4s, v20.4s
+          FCVTNS  v21.4s, v21.4s
+          FCVTNS  v22.4s, v22.4s
+          FCVTNS  v23.4s, v23.4s
+          FCVTNS  v24.4s, v24.4s
+          FCVTNS  v25.4s, v25.4s
+          FCVTNS  v26.4s, v26.4s
+          FCVTNS  v27.4s, v27.4s
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
 
         SQXTN   v16.4h, v16.4s
         SQXTN   v17.4h, v17.4s
@@ -368,7 +426,7 @@
 9:
         RET
 
-END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+END_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
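
The 7-byte rewind in the FP32 branch follows from its smaller params block: the
kernel post-increments past 4 bytes of scale, 2 of output zero point, and 1 of
min, then reads the final max byte without advancing. An illustrative layout
(field names are assumptions; the real definition lives in the params header):

    #include <stdint.h>

    struct qs8_fp32_minmax_params_sketch {  // 8 bytes; names are assumptions
      float   scale;                        // LD1R {v0.4s}, [x8], 4
      int16_t output_zero_point;            // LD1R {v2.8h}, [x8], 2   "add bias"
      int8_t  output_min;                   // LD1R {v0.16b}, [x8], 1  "clamp min value"
      int8_t  output_max;                   // LD1R {v1.16b}, [x8]     "clamp max value"
    };
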
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
new file mode 100644
index 0000000..beea3aa
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
@@ -0,0 +1,675 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t** restrict a, x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> (x0)
+#     size_t a_offset,                   [sp + 8] -> x11
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0  v4
+# A1  x14  v1  v5
+# A2  x15  v2  v6
+# A3  x10  v3  v7
+# B    x5  v8  v9 v10 v11
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused v12 v13 v14 v15
+
+# x8 temp for Cortex-A55 loads
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDR     x11, [sp, 8]            // Load a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        LDP     x12, x8, [sp, 16]       // Load zero, params pointer
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
+        STP     d8,  d9, [sp, -32]!     // Save d8-d11 on stack
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        STP     d10, d11, [sp, 16]
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+        BIC     x2, x2, 3
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x10, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x11           // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x11           // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x11           // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
+        CMP     x10, x12                // if a3 == zero
+        ADD     x10, x10, x11           // a3 += a_offset
+        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 += a_offset
+
+        # Is there at least 16 bytes for prologue/epilogue?
+        SUBS    x0, x2, 16              // k = kc - 16
+        B.LO    5f
+
+        # prologue - read A and B values for block 0 and 1
+        LDR     d0, [x13], 8
+        LDR     q8,  [x5], 16
+        LDR     d1, [x14], 8
+        LDR     d2, [x15], 8
+        LDR     d3, [x10], 8
+        SUBS    x0, x0, 16              // is there 16 for main loop?
+        LDR     d9,  [x5], 8
+        LDR     x8,  [x5], 8
+        # Is there at least 16 bytes for main loop?
+        B.LO    3f
+
+        # Main loop - 16 bytes of A in 4 groups.
+        # 4 rows of 4 vectors wide = 16 sdot instructions for 4 channels.
+        # 4 LD64 for A.
+        # 4 LD128 for W, each done as 2 LD64 + INS.
+        # For each 4 sdot: 1 LD64 for A, 2 LD64 for W + INS.
+
+        .p2align 3
+2:
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[0]
+        INS     v9.d[1], x8
+        SDOT    v18.4s,  v8.16b, v2.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[0]
+        LDR     d4,  [x13], 8
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[0]
+        INS     v10.d[1], x8
+        SDOT    v22.4s,  v9.16b, v2.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[0]
+        LDR     d5, [x14], 8
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[0]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[0]
+        INS     v11.d[1], x8
+        SDOT    v26.4s, v10.16b, v2.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[0]
+        LDR     d6, [x15], 8
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[0]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[0]
+        INS     v8.d[1], x8
+        SDOT    v30.4s, v11.16b, v2.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[0]
+        LDR     d7,  [x10], 8
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[1]
+        INS     v9.d[1], x8
+        SDOT    v18.4s,  v8.16b, v2.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[1]
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[1]
+        INS     v10.d[1], x8
+        SDOT    v22.4s,  v9.16b, v2.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[1]
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[1]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[1]
+        INS     v11.d[1], x8
+        SDOT    v26.4s, v10.16b, v2.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[1]
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[1]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[1]
+        INS     v8.d[1], x8
+        SDOT    v30.4s, v11.16b, v2.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[1]
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[0]
+        INS     v9.d[1], x8
+        SDOT    v18.4s,  v8.16b, v6.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[0]
+        LDR     d0,  [x13], 8
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[0]
+        INS     v10.d[1], x8
+        SDOT    v22.4s,  v9.16b, v6.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[0]
+        LDR     d1, [x14], 8
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[0]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v5.4b[0]
+        INS     v11.d[1], x8
+        SDOT    v26.4s, v10.16b, v6.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v27.4s, v10.16b, v7.4b[0]
+        LDR     d2, [x15], 8
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[0]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v5.4b[0]
+        INS     v8.d[1], x8
+        SDOT    v30.4s, v11.16b, v6.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v31.4s, v11.16b, v7.4b[0]
+        LDR     d3,  [x10], 8
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[1]
+        INS     v9.d[1], x8
+        SDOT    v18.4s,  v8.16b, v6.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[1]
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[1]
+        INS     v10.d[1], x8
+        SDOT    v22.4s,  v9.16b, v6.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[1]
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[1]
+        LDR     d8,  [x5], 8            // First B values for block 0 and 1
+        SDOT    v25.4s, v10.16b, v5.4b[1]
+        INS     v11.d[1], x8
+        SDOT    v26.4s, v10.16b, v6.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v27.4s, v10.16b, v7.4b[1]
+        SUBS    x0, x0, 16
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[1]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v5.4b[1]
+        INS     v8.d[1], x8
+        SDOT    v30.4s, v11.16b, v6.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v31.4s, v11.16b, v7.4b[1]
+        B.HS    2b
+
+        # Epilogue.  Same as main loop but no preloads in final group
+3:
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[0]
+        INS     v9.d[1], x8
+        SDOT    v18.4s,  v8.16b, v2.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[0]
+        LDR     d4,  [x13], 8
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[0]
+        INS     v10.d[1], x8
+        SDOT    v22.4s,  v9.16b, v2.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[0]
+        LDR     d5, [x14], 8
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[0]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[0]
+        INS     v11.d[1], x8
+        SDOT    v26.4s, v10.16b, v2.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[0]
+        LDR     d6, [x15], 8
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[0]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[0]
+        INS     v8.d[1], x8
+        SDOT    v30.4s, v11.16b, v2.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[0]
+        LDR     d7,  [x10], 8
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v0.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v1.4b[1]
+        INS     v9.d[1], x8
+        SDOT    v18.4s,  v8.16b, v2.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v3.4b[1]
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v0.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v1.4b[1]
+        INS     v10.d[1], x8
+        SDOT    v22.4s,  v9.16b, v2.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v3.4b[1]
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v0.4b[1]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v1.4b[1]
+        INS     v11.d[1], x8
+        SDOT    v26.4s, v10.16b, v2.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v27.4s, v10.16b, v3.4b[1]
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v0.4b[1]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v1.4b[1]
+        INS     v8.d[1], x8
+        SDOT    v30.4s, v11.16b, v2.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v31.4s, v11.16b, v3.4b[1]
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[0]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[0]
+        INS     v9.d[1], x8
+        SDOT    v18.4s,  v8.16b, v6.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[0]
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[0]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[0]
+        INS     v10.d[1], x8
+        SDOT    v22.4s,  v9.16b, v6.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[0]
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[0]
+        LDR     d8,  [x5], 8
+        SDOT    v25.4s, v10.16b, v5.4b[0]
+        INS     v11.d[1], x8
+        SDOT    v26.4s, v10.16b, v6.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v27.4s, v10.16b, v7.4b[0]
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[0]
+        LDR     d9,  [x5], 8
+        SDOT    v29.4s, v11.16b, v5.4b[0]
+        INS     v8.d[1], x8
+        SDOT    v30.4s, v11.16b, v6.4b[0]
+        LDR     x8,  [x5], 8
+        SDOT    v31.4s, v11.16b, v7.4b[0]
+
+        # BLOCK 0
+        SDOT    v16.4s,  v8.16b, v4.4b[1]
+        LDR     d10,  [x5], 8
+        SDOT    v17.4s,  v8.16b, v5.4b[1]
+        INS     v9.d[1], x8
+        SDOT    v18.4s,  v8.16b, v6.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v19.4s,  v8.16b, v7.4b[1]
+
+        # BLOCK 1
+        SDOT    v20.4s,  v9.16b, v4.4b[1]
+        LDR     d11,  [x5], 8
+        SDOT    v21.4s,  v9.16b, v5.4b[1]
+        INS     v10.d[1], x8
+        SDOT    v22.4s,  v9.16b, v6.4b[1]
+        LDR     x8,  [x5], 8
+        SDOT    v23.4s,  v9.16b, v7.4b[1]
+
+        # BLOCK 2
+        SDOT    v24.4s, v10.16b, v4.4b[1]
+        SDOT    v25.4s, v10.16b, v5.4b[1]
+        INS     v11.d[1], x8
+        SDOT    v26.4s, v10.16b, v6.4b[1]
+        SDOT    v27.4s, v10.16b, v7.4b[1]
+        AND     x0, x2, 15              // kc remainder 0 to 12
+
+        # BLOCK 3
+        SDOT    v28.4s, v11.16b, v4.4b[1]
+        SDOT    v29.4s, v11.16b, v5.4b[1]
+        LDR     x8, [sp, 56]            // reload params pointer
+        SDOT    v30.4s, v11.16b, v6.4b[1]
+        SDOT    v31.4s, v11.16b, v7.4b[1]
+
+        # Is there a remainder? - 4 to 12 bytes of A
+        CBNZ    x0, 6f
+
+        .p2align 3
+4:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        # Apply params - scale, bias and clamp
+        LD1R    {v0.4s}, [x8], 4
+
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v16.4s, v0.4s, v16.4s
+        FMUL    v17.4s, v0.4s, v17.4s
+        FMUL    v18.4s, v0.4s, v18.4s
+        FMUL    v19.4s, v0.4s, v19.4s
+        FMUL    v20.4s, v0.4s, v20.4s
+        FMUL    v21.4s, v0.4s, v21.4s
+        FMUL    v22.4s, v0.4s, v22.4s
+        FMUL    v23.4s, v0.4s, v23.4s
+        FMUL    v24.4s, v0.4s, v24.4s
+        FMUL    v25.4s, v0.4s, v25.4s
+        FMUL    v26.4s, v0.4s, v26.4s
+        FMUL    v27.4s, v0.4s, v27.4s
+        FMUL    v28.4s, v0.4s, v28.4s
+        FMUL    v29.4s, v0.4s, v29.4s
+        FMUL    v30.4s, v0.4s, v30.4s
+        FMUL    v31.4s, v0.4s, v31.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v2.8h}, [x8], 2       // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v2.8h
+        SQADD   v17.8h, v17.8h, v2.8h
+        SQADD   v18.8h, v18.8h, v2.8h
+        SQADD   v19.8h, v19.8h, v2.8h
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
+        LD1R    {v0.16b}, [x8], 1      // clamp min value
+
+        SQXTN   v4.8b, v16.8h
+        SQXTN   v5.8b, v17.8h
+        SQXTN   v6.8b, v18.8h
+        SQXTN   v7.8b, v19.8h
+        LD1R    {v1.16b}, [x8]         // clamp max value
+        SQXTN2  v4.16b, v24.8h
+        SQXTN2  v5.16b, v25.8h
+        SQXTN2  v6.16b, v26.8h
+        SQXTN2  v7.16b, v27.8h
+        SUB     x8, x8, 7               // rewind params pointer
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        LDR     x0, [sp, 32]           // cn_stride
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
+        SUBS    x1, x1, 16
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
+        B.LO    7f
+
+        # Store full 4 x 16
+        ST1     {v7.16b},  [x7], x0
+        ST1     {v6.16b}, [x17], x0
+        ST1     {v5.16b}, [x16], x0
+        ST1     {v4.16b},  [x6], x0
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore d8-d15 from stack
+        LDP     d10, d11, [sp, 16]
+        LDP     d8,  d9, [sp], 32
+        RET
+
+        # Remainder - 4 to 12 bytes of A
+        # Although C4, it's safe to read 16 bytes.
+        .p2align 3
+5:
+        AND     x0, x2, 15              // kc remainder 4 to 12
+6:
+        LDR     q0, [x13]
+        LDP     q8,  q9,  [x5], 32
+        LDR     q1, [x14]
+        LDR     q2, [x15]
+        LDR     q3, [x10]
+        LDP     q10, q11, [x5], 32
+        SDOT    v16.4s,  v8.16b, v0.4b[0]
+        SDOT    v17.4s,  v8.16b, v1.4b[0]
+        SDOT    v18.4s,  v8.16b, v2.4b[0]
+        SDOT    v19.4s,  v8.16b, v3.4b[0]
+        SDOT    v20.4s,  v9.16b, v0.4b[0]
+        SDOT    v21.4s,  v9.16b, v1.4b[0]
+        SDOT    v22.4s,  v9.16b, v2.4b[0]
+        SDOT    v23.4s,  v9.16b, v3.4b[0]
+        SDOT    v24.4s, v10.16b, v0.4b[0]
+        SDOT    v25.4s, v10.16b, v1.4b[0]
+        SDOT    v26.4s, v10.16b, v2.4b[0]
+        SDOT    v27.4s, v10.16b, v3.4b[0]
+        SDOT    v28.4s, v11.16b, v0.4b[0]
+        SDOT    v29.4s, v11.16b, v1.4b[0]
+        SDOT    v30.4s, v11.16b, v2.4b[0]
+        SDOT    v31.4s, v11.16b, v3.4b[0]
+        CMP     x0, 4
+        B.LS    4b
+        LDP     q8,  q9,  [x5], 32
+        LDP     q10, q11,  [x5], 32
+        SDOT    v16.4s,  v8.16b, v0.4b[1]
+        SDOT    v17.4s,  v8.16b, v1.4b[1]
+        SDOT    v18.4s,  v8.16b, v2.4b[1]
+        SDOT    v19.4s,  v8.16b, v3.4b[1]
+        SDOT    v20.4s,  v9.16b, v0.4b[1]
+        SDOT    v21.4s,  v9.16b, v1.4b[1]
+        SDOT    v22.4s,  v9.16b, v2.4b[1]
+        SDOT    v23.4s,  v9.16b, v3.4b[1]
+        SDOT    v24.4s, v10.16b, v0.4b[1]
+        SDOT    v25.4s, v10.16b, v1.4b[1]
+        SDOT    v26.4s, v10.16b, v2.4b[1]
+        SDOT    v27.4s, v10.16b, v3.4b[1]
+        SDOT    v28.4s, v11.16b, v0.4b[1]
+        SDOT    v29.4s, v11.16b, v1.4b[1]
+        SDOT    v30.4s, v11.16b, v2.4b[1]
+        SDOT    v31.4s, v11.16b, v3.4b[1]
+        CMP     x0, 8
+        B.LS    4b
+        LDP     q8,  q9,  [x5], 32
+        LDP     q10, q11,  [x5], 32
+        SDOT    v16.4s,  v8.16b, v0.4b[2]
+        SDOT    v17.4s,  v8.16b, v1.4b[2]
+        SDOT    v18.4s,  v8.16b, v2.4b[2]
+        SDOT    v19.4s,  v8.16b, v3.4b[2]
+        SDOT    v20.4s,  v9.16b, v0.4b[2]
+        SDOT    v21.4s,  v9.16b, v1.4b[2]
+        SDOT    v22.4s,  v9.16b, v2.4b[2]
+        SDOT    v23.4s,  v9.16b, v3.4b[2]
+        SDOT    v24.4s, v10.16b, v0.4b[2]
+        SDOT    v25.4s, v10.16b, v1.4b[2]
+        SDOT    v26.4s, v10.16b, v2.4b[2]
+        SDOT    v27.4s, v10.16b, v3.4b[2]
+        SDOT    v28.4s, v11.16b, v0.4b[2]
+        SDOT    v29.4s, v11.16b, v1.4b[2]
+        SDOT    v30.4s, v11.16b, v2.4b[2]
+        SDOT    v31.4s, v11.16b, v3.4b[2]
+        B       4b
+
+        # Store odd width
+        .p2align 3
+7:
+        TBZ     x1, 3, 8f
+        STR     d7, [x7], 8
+        DUP     d7, v7.d[1]
+        STR     d6, [x17], 8
+        DUP     d6, v6.d[1]
+        STR     d5, [x16], 8
+        DUP     d5, v5.d[1]
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+8:
+        TBZ     x1, 2, 9f
+        STR     s7, [x7], 4
+        DUP     s7, v7.s[1]
+        STR     s6, [x17], 4
+        DUP     s6, v6.s[1]
+        STR     s5, [x16], 4
+        DUP     s5, v5.s[1]
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+9:
+        TBZ     x1, 1, 10f
+        ST1     {v7.h}[0], [x7], 2
+        DUP     h7, v7.h[1]
+        ST1     {v6.h}[0], [x17], 2
+        DUP     h6, v6.h[1]
+        ST1     {v5.h}[0], [x16], 2
+        DUP     h5, v5.h[1]
+        ST1     {v4.h}[0], [x6], 2
+        DUP     h4, v4.h[1]
+10:
+        TBZ     x1, 0, 11f
+        ST1     {v7.b}[0], [x7]
+        ST1     {v6.b}[0], [x17]
+        ST1     {v5.b}[0], [x16]
+        ST1     {v4.b}[0], [x6]
+11:
+        # Restore d8-d15 from stack
+        LDP     d10, d11, [sp, 16]
+        LDP     d8,  d9, [sp], 32
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
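
All of the fp32 variants share the epilogue seen above: SCVTF converts the
int32 accumulators to float, FMUL applies the scale, FCVTNS rounds back to
int32 with round-to-nearest-even, SQXTN/SQXTN2 narrow with saturation, SQADD
adds the output zero point, and SMAX/SMIN clamp. A minimal per-lane scalar C
sketch follows; the helper and parameter names are illustrative stand-ins for
the fields the kernel loads from the params block, not the library's
reference implementation:

    #include <math.h>
    #include <stdint.h>

    /* Per-lane model of the fp32 requantization epilogue above. */
    static inline int8_t requantize_fp32(int32_t acc, float scale,
                                         int16_t zero_point,
                                         int8_t out_min, int8_t out_max) {
      const float scaled = (float) acc * scale;    /* SCVTF + FMUL */
      int32_t q = (int32_t) lrintf(scaled);        /* FCVTNS, ties-to-even */
      if (q > INT16_MAX) q = INT16_MAX;            /* SQXTN: saturate to 16 bits */
      if (q < INT16_MIN) q = INT16_MIN;
      q += zero_point;                             /* SQADD: add output zero point */
      if (q >  127) q =  127;                      /* SQXTN: saturate to 8 bits */
      if (q < -128) q = -128;
      if (q < out_min) q = out_min;                /* SMAX: clamp to min */
      if (q > out_max) q = out_max;                /* SMIN: clamp to max */
      return (int8_t) q;
    }
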
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
new file mode 100644
index 0000000..351eef4
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
@@ -0,0 +1,440 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t**restrict a,  x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x11
+#     const float* zero,                 [sp + 16] -> x12
+#     const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3   x8  v3
+# B    x5  v4  v5  v6  v7
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x11, [sp]          // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDR     x12, [sp, 16]           // Load zero pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+        BIC     x2, x2, 3
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x8, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x11           // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x11           // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x11           // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
+        CMP     x8, x12                 // if a3 == zero
+        ADD     x8, x8, x11             // a3 += a_offset
+        CSEL    x8, x12, x8, EQ         //   a3 = zero, else a3 += a_offset
+
+        # Is there at least 16 bytes for main loop?
+        SUBS    x0, x2, 16              // k = kc - 16
+        B.LO    4f
+
+        # Main loop - 16 bytes of A
+        .p2align 3
+2:
+        LDR     q0, [x13], 16
+        LDR     q4,  [x5], 16
+        LDR     q1, [x14], 16
+        LDR     q2, [x15], 16
+        LDR     q3,  [x8], 16
+        LDR     q5,  [x5], 16
+        SDOT    v16.4s, v4.16b,  v0.4b[0]
+        SDOT    v17.4s, v4.16b,  v1.4b[0]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[0]
+        SDOT    v19.4s, v4.16b,  v3.4b[0]
+        SDOT    v20.4s, v5.16b,  v0.4b[0]
+        SDOT    v21.4s, v5.16b,  v1.4b[0]
+        SDOT    v22.4s, v5.16b,  v2.4b[0]
+        SDOT    v23.4s, v5.16b,  v3.4b[0]
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        LDP     q4, q5, [x5], 32
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+
+        SDOT    v16.4s, v4.16b,  v0.4b[1]
+        SDOT    v17.4s, v4.16b,  v1.4b[1]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[1]
+        SDOT    v19.4s, v4.16b,  v3.4b[1]
+        SDOT    v20.4s, v5.16b,  v0.4b[1]
+        SDOT    v21.4s, v5.16b,  v1.4b[1]
+        SDOT    v22.4s, v5.16b,  v2.4b[1]
+        SDOT    v23.4s, v5.16b,  v3.4b[1]
+        SDOT    v24.4s, v6.16b,  v0.4b[1]
+        SDOT    v25.4s, v6.16b,  v1.4b[1]
+        LDP     q4, q5, [x5], 32
+        SDOT    v26.4s, v6.16b,  v2.4b[1]
+        SDOT    v27.4s, v6.16b,  v3.4b[1]
+        SDOT    v28.4s, v7.16b,  v0.4b[1]
+        SDOT    v29.4s, v7.16b,  v1.4b[1]
+        SDOT    v30.4s, v7.16b,  v2.4b[1]
+        SDOT    v31.4s, v7.16b,  v3.4b[1]
+
+        SDOT    v16.4s, v4.16b,  v0.4b[2]
+        SDOT    v17.4s, v4.16b,  v1.4b[2]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[2]
+        SDOT    v19.4s, v4.16b,  v3.4b[2]
+        SDOT    v20.4s, v5.16b,  v0.4b[2]
+        SDOT    v21.4s, v5.16b,  v1.4b[2]
+        SDOT    v22.4s, v5.16b,  v2.4b[2]
+        SDOT    v23.4s, v5.16b,  v3.4b[2]
+        SDOT    v24.4s, v6.16b,  v0.4b[2]
+        SDOT    v25.4s, v6.16b,  v1.4b[2]
+        LDP     q4, q5, [x5], 32
+        SDOT    v26.4s, v6.16b,  v2.4b[2]
+        SDOT    v27.4s, v6.16b,  v3.4b[2]
+        SDOT    v28.4s, v7.16b,  v0.4b[2]
+        SDOT    v29.4s, v7.16b,  v1.4b[2]
+        SDOT    v30.4s, v7.16b,  v2.4b[2]
+        SDOT    v31.4s, v7.16b,  v3.4b[2]
+
+        SDOT    v16.4s, v4.16b,  v0.4b[3]
+        SDOT    v17.4s, v4.16b,  v1.4b[3]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[3]
+        SDOT    v19.4s, v4.16b,  v3.4b[3]
+        SDOT    v20.4s, v5.16b,  v0.4b[3]
+        SDOT    v21.4s, v5.16b,  v1.4b[3]
+        SDOT    v22.4s, v5.16b,  v2.4b[3]
+        SDOT    v23.4s, v5.16b,  v3.4b[3]
+        SDOT    v24.4s, v6.16b,  v0.4b[3]
+        SDOT    v25.4s, v6.16b,  v1.4b[3]
+        SDOT    v26.4s, v6.16b,  v2.4b[3]
+        SDOT    v27.4s, v6.16b,  v3.4b[3]
+        SUBS    x0, x0, 16
+        SDOT    v28.4s, v7.16b,  v0.4b[3]
+        SDOT    v29.4s, v7.16b,  v1.4b[3]
+        SDOT    v30.4s, v7.16b,  v2.4b[3]
+        SDOT    v31.4s, v7.16b,  v3.4b[3]
+        B.HS    2b
+
+        # Is there a remainder? - 4 to 12 bytes of A
+        TST     x0, 15
+        B.NE    4f
+
+3:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        LDR     x8, [sp, 24]            // reload params pointer
+
+        # Apply params - scale, bias and clamp
+        LD1R    {v0.4s}, [x8], 4
+
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v16.4s, v0.4s, v16.4s
+        FMUL    v17.4s, v0.4s, v17.4s
+        FMUL    v18.4s, v0.4s, v18.4s
+        FMUL    v19.4s, v0.4s, v19.4s
+        FMUL    v20.4s, v0.4s, v20.4s
+        FMUL    v21.4s, v0.4s, v21.4s
+        FMUL    v22.4s, v0.4s, v22.4s
+        FMUL    v23.4s, v0.4s, v23.4s
+        FMUL    v24.4s, v0.4s, v24.4s
+        FMUL    v25.4s, v0.4s, v25.4s
+        FMUL    v26.4s, v0.4s, v26.4s
+        FMUL    v27.4s, v0.4s, v27.4s
+        FMUL    v28.4s, v0.4s, v28.4s
+        FMUL    v29.4s, v0.4s, v29.4s
+        FMUL    v30.4s, v0.4s, v30.4s
+        FMUL    v31.4s, v0.4s, v31.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v2.8h}, [x8], 2       // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v2.8h
+        SQADD   v17.8h, v17.8h, v2.8h
+        SQADD   v18.8h, v18.8h, v2.8h
+        SQADD   v19.8h, v19.8h, v2.8h
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
+        LD1R    {v0.16b}, [x8], 1      // clamp min value
+
+        SQXTN   v4.8b, v16.8h
+        SQXTN   v5.8b, v17.8h
+        SQXTN   v6.8b, v18.8h
+        SQXTN   v7.8b, v19.8h
+        LD1R    {v1.16b}, [x8]         // clamp max value
+        SQXTN2  v4.16b, v24.8h
+        SQXTN2  v5.16b, v25.8h
+        SQXTN2  v6.16b, v26.8h
+        SQXTN2  v7.16b, v27.8h
+
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
+        SUBS    x1, x1, 16
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
+        B.LO    6f
+
+        # Store full 4 x 16
+        ST1     {v7.16b},  [x7], x10
+        ST1     {v6.16b}, [x17], x10
+        ST1     {v5.16b}, [x16], x10
+        ST1     {v4.16b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+        RET
+
+        # Remainder - 8 bytes of A
+        .p2align 3
+4:
+        # Is there a remainder? - 8 bytes of A
+        TBZ     x0, 3, 5f
+
+        LDR     d0, [x13], 8
+        LDR     q4,  [x5], 16
+        LDR     d1, [x14], 8
+        LDR     d2, [x15], 8
+        LDR     d3,  [x8], 8
+        LDR     q5,  [x5], 16
+        SDOT    v16.4s, v4.16b,  v0.4b[0]
+        SDOT    v17.4s, v4.16b,  v1.4b[0]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[0]
+        SDOT    v19.4s, v4.16b,  v3.4b[0]
+        SDOT    v20.4s, v5.16b,  v0.4b[0]
+        SDOT    v21.4s, v5.16b,  v1.4b[0]
+        SDOT    v22.4s, v5.16b,  v2.4b[0]
+        SDOT    v23.4s, v5.16b,  v3.4b[0]
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        LDP     q4, q5, [x5], 32
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+        SDOT    v16.4s, v4.16b,  v0.4b[1]
+        SDOT    v17.4s, v4.16b,  v1.4b[1]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[1]
+        SDOT    v19.4s, v4.16b,  v3.4b[1]
+        SDOT    v20.4s, v5.16b,  v0.4b[1]
+        SDOT    v21.4s, v5.16b,  v1.4b[1]
+        SDOT    v22.4s, v5.16b,  v2.4b[1]
+        SDOT    v23.4s, v5.16b,  v3.4b[1]
+        SDOT    v24.4s, v6.16b,  v0.4b[1]
+        SDOT    v25.4s, v6.16b,  v1.4b[1]
+        SDOT    v26.4s, v6.16b,  v2.4b[1]
+        SDOT    v27.4s, v6.16b,  v3.4b[1]
+        SDOT    v28.4s, v7.16b,  v0.4b[1]
+        SDOT    v29.4s, v7.16b,  v1.4b[1]
+        SDOT    v30.4s, v7.16b,  v2.4b[1]
+        SDOT    v31.4s, v7.16b,  v3.4b[1]
+
+        # Remainder - 4 bytes of A
+5:
+        # Is there a remainder? - 4 bytes of A
+        TBZ     x0, 2, 3b
+
+        LDR     s0, [x13], 4
+        LDR     q4, [x5], 16
+        LDR     s1, [x14], 4
+        LDR     s2, [x15], 4
+        LDR     s3,  [x8], 4
+        LDR     q5, [x5], 16
+        SDOT    v16.4s, v4.16b,  v0.4b[0]
+        SDOT    v17.4s, v4.16b,  v1.4b[0]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[0]
+        SDOT    v19.4s, v4.16b,  v3.4b[0]
+        SDOT    v20.4s, v5.16b,  v0.4b[0]
+        SDOT    v21.4s, v5.16b,  v1.4b[0]
+        SDOT    v22.4s, v5.16b,  v2.4b[0]
+        SDOT    v23.4s, v5.16b,  v3.4b[0]
+        LDR     x8, [sp, 24]            // reload params pointer
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+        B       3b
+
+        # Store odd width
+        .p2align 3
+6:
+        TBZ     x1, 3, 7f
+        STR     d7, [x7], 8
+        DUP     d7, v7.d[1]
+        STR     d6, [x17], 8
+        DUP     d6, v6.d[1]
+        STR     d5, [x16], 8
+        DUP     d5, v5.d[1]
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+7:
+        TBZ     x1, 2, 8f
+        STR     s7, [x7], 4
+        DUP     s7, v7.s[1]
+        STR     s6, [x17], 4
+        DUP     s6, v6.s[1]
+        STR     s5, [x16], 4
+        DUP     s5, v5.s[1]
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+8:
+        TBZ     x1, 1, 9f
+        ST1     {v7.h}[0], [x7], 2
+        DUP     h7, v7.h[1]
+        ST1     {v6.h}[0], [x17], 2
+        DUP     h6, v6.h[1]
+        ST1     {v5.h}[0], [x16], 2
+        DUP     h5, v5.h[1]
+        ST1     {v4.h}[0], [x6], 2
+        DUP     h4, v4.h[1]
+9:
+        TBZ     x1, 0, 10f
+        ST1     {v7.b}[0], [x7]
+        ST1     {v6.b}[0], [x17]
+        ST1     {v5.b}[0], [x16]
+        ST1     {v4.b}[0], [x6]
+10:
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
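
The CMP/ADD/CSEL triplets at label 1 implement the IGEMM indirection: an A
pointer equal to the shared `zero` buffer is used unchanged, while any other
pointer is rebased by `a_offset`. (The prologue's ADD/BIC pair is the usual
round-up trick, kc = (kc + 3) & ~3, so e.g. kc = 5 becomes 8.) A one-line
scalar equivalent of the pointer setup, with illustrative names:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the per-row A-pointer setup: pointers equal to `zero`
       select the shared zero buffer, all others are offset by a_offset.
       (CMP aN, zero; ADD aN, aN, a_offset; CSEL aN, zero, aN, EQ) */
    static inline const int8_t* setup_a_pointer(const int8_t* a,
                                                const int8_t* zero,
                                                size_t a_offset) {
      return (a == zero) ? zero : (const int8_t*) ((uintptr_t) a + a_offset);
    }
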
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
new file mode 100644
index 0000000..f6cf7f5
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
@@ -0,0 +1,353 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t**restrict a,  x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x11
+#     const float* zero,                 [sp + 16] -> x12
+#     const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3   x8  v3
+# B    x5  v4  v5  v6  v7
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x11, [sp]          // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDR     x12, [sp, 16]           // Load zero pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+        BIC     x2, x2, 3
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x8, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x11           // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x11           // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x11           // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
+        CMP     x8, x12                 // if a3 == zero
+        ADD     x8, x8, x11             // a3 += a_offset
+        CSEL    x8, x12, x8, EQ         //   a3 = zero, else a3 += a_offset
+
+        # Is there at least 8 bytes for main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LDR     d0, [x13], 8
+        LDR     q4,  [x5], 16
+        LDR     d1, [x14], 8
+        LDR     d2, [x15], 8
+        LDR     d3, [x8], 8
+        LDR     q5,  [x5], 16
+        SDOT    v16.4s, v4.16b,  v0.4b[0]
+        SDOT    v17.4s, v4.16b,  v1.4b[0]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[0]
+        SDOT    v19.4s, v4.16b,  v3.4b[0]
+        SDOT    v20.4s, v5.16b,  v0.4b[0]
+        SDOT    v21.4s, v5.16b,  v1.4b[0]
+        SDOT    v22.4s, v5.16b,  v2.4b[0]
+        SDOT    v23.4s, v5.16b,  v3.4b[0]
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        LDP     q4, q5, [x5], 32
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+        SDOT    v16.4s, v4.16b,  v0.4b[1]
+        SDOT    v17.4s, v4.16b,  v1.4b[1]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[1]
+        SDOT    v19.4s, v4.16b,  v3.4b[1]
+        SDOT    v20.4s, v5.16b,  v0.4b[1]
+        SDOT    v21.4s, v5.16b,  v1.4b[1]
+        SDOT    v22.4s, v5.16b,  v2.4b[1]
+        SDOT    v23.4s, v5.16b,  v3.4b[1]
+        SDOT    v24.4s, v6.16b,  v0.4b[1]
+        SDOT    v25.4s, v6.16b,  v1.4b[1]
+        SDOT    v26.4s, v6.16b,  v2.4b[1]
+        SDOT    v27.4s, v6.16b,  v3.4b[1]
+        SDOT    v28.4s, v7.16b,  v0.4b[1]
+        SDOT    v29.4s, v7.16b,  v1.4b[1]
+        SDOT    v30.4s, v7.16b,  v2.4b[1]
+        SUBS    x0, x0, 8
+        SDOT    v31.4s, v7.16b,  v3.4b[1]
+        B.HS    2b
+
+        # Is there a remainder? - 4 bytes of A
+        TBNZ    x0, 2, 4f
+
+        LDR     x8, [sp, 24]            // reload params pointer
+
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+3:
+        # Apply params - scale, bias and clamp
+        LD1R    {v0.4s}, [x8], 4
+
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v16.4s, v0.4s, v16.4s
+        FMUL    v17.4s, v0.4s, v17.4s
+        FMUL    v18.4s, v0.4s, v18.4s
+        FMUL    v19.4s, v0.4s, v19.4s
+        FMUL    v20.4s, v0.4s, v20.4s
+        FMUL    v21.4s, v0.4s, v21.4s
+        FMUL    v22.4s, v0.4s, v22.4s
+        FMUL    v23.4s, v0.4s, v23.4s
+        FMUL    v24.4s, v0.4s, v24.4s
+        FMUL    v25.4s, v0.4s, v25.4s
+        FMUL    v26.4s, v0.4s, v26.4s
+        FMUL    v27.4s, v0.4s, v27.4s
+        FMUL    v28.4s, v0.4s, v28.4s
+        FMUL    v29.4s, v0.4s, v29.4s
+        FMUL    v30.4s, v0.4s, v30.4s
+        FMUL    v31.4s, v0.4s, v31.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v2.8h}, [x8], 2        // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v2.8h
+        SQADD   v17.8h, v17.8h, v2.8h
+        SQADD   v18.8h, v18.8h, v2.8h
+        SQADD   v19.8h, v19.8h, v2.8h
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
+        LD1R    {v0.16b}, [x8], 1       // clamp min value
+
+        SQXTN   v4.8b, v16.8h
+        SQXTN   v5.8b, v17.8h
+        SQXTN   v6.8b, v18.8h
+        SQXTN   v7.8b, v19.8h
+        LD1R    {v1.16b}, [x8]          // clamp max value
+        SQXTN2  v4.16b, v24.8h
+        SQXTN2  v5.16b, v25.8h
+        SQXTN2  v6.16b, v26.8h
+        SQXTN2  v7.16b, v27.8h
+
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
+        SUBS    x1, x1, 16
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
+        B.LO    5f
+
+        # Store full 4 x 16
+        ST1     {v7.16b},  [x7], x10
+        ST1     {v6.16b}, [x17], x10
+        ST1     {v5.16b}, [x16], x10
+        ST1     {v4.16b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+        RET
+
+        # Remainder - 4 bytes of A
+        .p2align 3
+4:
+        LDR     s0, [x13], 4
+        LDR     q4, [x5], 16
+        LDR     s1, [x14], 4
+        LDR     s2, [x15], 4
+        LDR     s3, [x8], 4
+        LDR     q5, [x5], 16
+        SDOT    v16.4s, v4.16b,  v0.4b[0]
+        SDOT    v17.4s, v4.16b,  v1.4b[0]
+        LDP     q6, q7, [x5], 32
+        SDOT    v18.4s, v4.16b,  v2.4b[0]
+        SDOT    v19.4s, v4.16b,  v3.4b[0]
+        SDOT    v20.4s, v5.16b,  v0.4b[0]
+        SDOT    v21.4s, v5.16b,  v1.4b[0]
+        SDOT    v22.4s, v5.16b,  v2.4b[0]
+        SDOT    v23.4s, v5.16b,  v3.4b[0]
+        LDR     x8, [sp, 24]            // reload params pointer
+        SDOT    v24.4s, v6.16b, v0.4b[0]
+        SDOT    v25.4s, v6.16b, v1.4b[0]
+        SDOT    v26.4s, v6.16b, v2.4b[0]
+        SDOT    v27.4s, v6.16b, v3.4b[0]
+        SDOT    v28.4s, v7.16b, v0.4b[0]
+        SDOT    v29.4s, v7.16b, v1.4b[0]
+        SDOT    v30.4s, v7.16b, v2.4b[0]
+        SDOT    v31.4s, v7.16b, v3.4b[0]
+
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 3, 6f
+        STR     d7, [x7], 8
+        DUP     d7, v7.d[1]
+        STR     d6, [x17], 8
+        DUP     d6, v6.d[1]
+        STR     d5, [x16], 8
+        DUP     d5, v5.d[1]
+        STR     d4, [x6], 8
+        DUP     d4, v4.d[1]
+6:
+        TBZ     x1, 2, 7f
+        STR     s7, [x7], 4
+        DUP     s7, v7.s[1]
+        STR     s6, [x17], 4
+        DUP     s6, v6.s[1]
+        STR     s5, [x16], 4
+        DUP     s5, v5.s[1]
+        STR     s4, [x6], 4
+        DUP     s4, v4.s[1]
+7:
+        TBZ     x1, 1, 8f
+        ST1     {v7.h}[0], [x7], 2
+        DUP     h7, v7.h[1]
+        ST1     {v6.h}[0], [x17], 2
+        DUP     h6, v6.h[1]
+        ST1     {v5.h}[0], [x16], 2
+        DUP     h5, v5.h[1]
+        ST1     {v4.h}[0], [x6], 2
+        DUP     h4, v4.h[1]
+8:
+        TBZ     x1, 0, 9f
+        ST1     {v7.b}[0], [x7]
+        ST1     {v6.b}[0], [x17]
+        ST1     {v5.b}[0], [x16]
+        ST1     {v4.b}[0], [x6]
+9:
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
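
Each kernel's "Store odd width" tail peels the final columns by testing bits
3..0 of the remaining count: store 8 bytes, then 4, then 2, then 1, with a
DUP after every partial store to rotate the not-yet-written half of the row
vector down to element 0. A scalar sketch of the same bit-peeling, under
illustrative names:

    #include <stdint.h>
    #include <string.h>

    /* Model of the odd-width store tail for one 16-byte output row.
       `nc` is the number of columns left (1..15); the `pos` cursor plays
       the role of the DUPs that shift consumed bytes out of the vector. */
    static void store_odd_width(int8_t* c, const int8_t row[16], unsigned nc) {
      unsigned pos = 0;
      if (nc & 8) { memcpy(c, row + pos, 8); c += 8; pos += 8; }  /* TBZ bit 3 */
      if (nc & 4) { memcpy(c, row + pos, 4); c += 4; pos += 4; }  /* TBZ bit 2 */
      if (nc & 2) { memcpy(c, row + pos, 2); c += 2; pos += 2; }  /* TBZ bit 1 */
      if (nc & 1) { *c = row[pos]; }                              /* TBZ bit 0 */
    }
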
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
index c7cbfee..087dfbe 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
@@ -9,6 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
+
 # void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
@@ -21,7 +22,7 @@
 #     size_t cn_stride,                  [sp] -> (x0)
 #     size_t a_offset,                   [sp + 8] -> x11
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+#     const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -433,73 +434,70 @@
         B.HI    1b
 
         # Apply params - scale, shift, bias and clamp
-        LD1R    {v0.4s}, [x8], 4
-        SQRDMULH v4.4s, v16.4s, v0.4s
-        SQRDMULH v5.4s, v17.4s, v0.4s
-        LD1R    {v1.4s}, [x8], 4
-        SQRDMULH v6.4s, v18.4s, v0.4s
-        SQRDMULH v7.4s, v19.4s, v0.4s
-        SQRDMULH v8.4s, v20.4s, v0.4s
-        SQRDMULH v9.4s, v21.4s, v0.4s
+        LD2R    {v0.4s, v1.4s}, [x8], 8
         CMEQ    v2.4s, v1.4s, 0
-        SQRDMULH v10.4s, v22.4s, v0.4s
-        SQRDMULH v11.4s, v23.4s, v0.4s
 
-        BIC     v16.16b, v16.16b, v2.16b
-        BIC     v17.16b, v17.16b, v2.16b
-        BIC     v18.16b, v18.16b, v2.16b
-        BIC     v19.16b, v19.16b, v2.16b
-        BIC     v20.16b, v20.16b, v2.16b
-        BIC     v21.16b, v21.16b, v2.16b
-        BIC     v22.16b, v22.16b, v2.16b
-        BIC     v23.16b, v23.16b, v2.16b
+        BIC     v4.16b, v16.16b, v2.16b
+        BIC     v5.16b, v17.16b, v2.16b
+        BIC     v6.16b, v18.16b, v2.16b
+        BIC     v7.16b, v19.16b, v2.16b
 
-        SSRA    v4.4s, v16.4s, 31       // signed shift right accumulate
-        SSRA    v5.4s, v17.4s, 31
-        SSRA    v6.4s, v18.4s, 31
-        SSRA    v7.4s, v19.4s, 31
-        SSRA    v8.4s, v20.4s, 31
-        SSRA    v9.4s, v21.4s, 31
-        SSRA    v10.4s, v22.4s, 31
-        SSRA    v11.4s, v23.4s, 31
+        SQRDMULH v16.4s, v16.4s, v0.4s
+        SQRDMULH v17.4s, v17.4s, v0.4s
+        SQRDMULH v18.4s, v18.4s, v0.4s
+        SQRDMULH v19.4s, v19.4s, v0.4s
 
-        SQRDMULH v16.4s, v24.4s, v0.4s
-        SQRDMULH v17.4s, v25.4s, v0.4s
-        SQRDMULH v18.4s, v26.4s, v0.4s
-        SQRDMULH v19.4s, v27.4s, v0.4s
-        SQRDMULH v20.4s, v28.4s, v0.4s
-        SQRDMULH v21.4s, v29.4s, v0.4s
-        SQRDMULH v22.4s, v30.4s, v0.4s
-        SQRDMULH v23.4s, v31.4s, v0.4s
+        SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
+        SSRA    v17.4s, v5.4s, 31
+        SSRA    v18.4s, v6.4s, 31
+        SSRA    v19.4s, v7.4s, 31
 
-        BIC     v24.16b, v24.16b, v2.16b
-        BIC     v25.16b, v25.16b, v2.16b
-        BIC     v26.16b, v26.16b, v2.16b
-        BIC     v27.16b, v27.16b, v2.16b
-        BIC     v28.16b, v28.16b, v2.16b
-        BIC     v29.16b, v29.16b, v2.16b
-        BIC     v30.16b, v30.16b, v2.16b
-        BIC     v31.16b, v31.16b, v2.16b
+        BIC     v4.16b, v20.16b, v2.16b
+        BIC     v5.16b, v21.16b, v2.16b
+        BIC     v6.16b, v22.16b, v2.16b
+        BIC     v7.16b, v23.16b, v2.16b
 
-        SSRA    v16.4s, v24.4s, 31
-        SSRA    v17.4s, v25.4s, 31
-        SSRA    v18.4s, v26.4s, 31
-        SSRA    v19.4s, v27.4s, 31
-        SSRA    v20.4s, v28.4s, 31
-        SSRA    v21.4s, v29.4s, 31
-        SSRA    v22.4s, v30.4s, 31
-        SSRA    v23.4s, v31.4s, 31
+        SQRDMULH v20.4s, v20.4s, v0.4s
+        SQRDMULH v21.4s, v21.4s, v0.4s
+        SQRDMULH v22.4s, v22.4s, v0.4s
+        SQRDMULH v23.4s, v23.4s, v0.4s
 
-        SRSHL   v4.4s,  v4.4s, v1.4s    // signed rounding shift left
-        SRSHL   v5.4s,  v5.4s, v1.4s
-        SRSHL   v6.4s,  v6.4s, v1.4s
-        SRSHL   v7.4s,  v7.4s, v1.4s
-        SRSHL   v8.4s,  v8.4s, v1.4s
-        SRSHL   v9.4s,  v9.4s, v1.4s
-        SRSHL   v10.4s, v10.4s, v1.4s
-        SRSHL   v11.4s, v11.4s, v1.4s
+        SSRA    v20.4s, v4.4s, 31
+        SSRA    v21.4s, v5.4s, 31
+        SSRA    v22.4s, v6.4s, 31
+        SSRA    v23.4s, v7.4s, 31
 
-        SRSHL   v16.4s, v16.4s, v1.4s
+        BIC     v4.16b, v24.16b, v2.16b
+        BIC     v5.16b, v25.16b, v2.16b
+        BIC     v6.16b, v26.16b, v2.16b
+        BIC     v7.16b, v27.16b, v2.16b
+
+        SQRDMULH v24.4s, v24.4s, v0.4s
+        SQRDMULH v25.4s, v25.4s, v0.4s
+        SQRDMULH v26.4s, v26.4s, v0.4s
+        SQRDMULH v27.4s, v27.4s, v0.4s
+
+        SSRA    v24.4s, v4.4s, 31
+        SSRA    v25.4s, v5.4s, 31
+        SSRA    v26.4s, v6.4s, 31
+        SSRA    v27.4s, v7.4s, 31
+
+        BIC     v4.16b, v28.16b, v2.16b
+        BIC     v5.16b, v29.16b, v2.16b
+        BIC     v6.16b, v30.16b, v2.16b
+        BIC     v7.16b, v31.16b, v2.16b
+
+        SQRDMULH v28.4s, v28.4s, v0.4s
+        SQRDMULH v29.4s, v29.4s, v0.4s
+        SQRDMULH v30.4s, v30.4s, v0.4s
+        SQRDMULH v31.4s, v31.4s, v0.4s
+
+        SSRA    v28.4s, v4.4s, 31
+        SSRA    v29.4s, v5.4s, 31
+        SSRA    v30.4s, v6.4s, 31
+        SSRA    v31.4s, v7.4s, 31
+
+        SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
         SRSHL   v17.4s, v17.4s, v1.4s
         SRSHL   v18.4s, v18.4s, v1.4s
         SRSHL   v19.4s, v19.4s, v1.4s
@@ -507,57 +505,64 @@
         SRSHL   v21.4s, v21.4s, v1.4s
         SRSHL   v22.4s, v22.4s, v1.4s
         SRSHL   v23.4s, v23.4s, v1.4s
+        SRSHL   v24.4s, v24.4s, v1.4s
+        SRSHL   v25.4s, v25.4s, v1.4s
+        SRSHL   v26.4s, v26.4s, v1.4s
+        SRSHL   v27.4s, v27.4s, v1.4s
+        SRSHL   v28.4s, v28.4s, v1.4s
+        SRSHL   v29.4s, v29.4s, v1.4s
+        SRSHL   v30.4s, v30.4s, v1.4s
+        SRSHL   v31.4s, v31.4s, v1.4s
 
-        SQXTN   v4.4h,  v4.4s
-        SQXTN   v5.4h,  v5.4s
-        SQXTN   v6.4h,  v6.4s
-        SQXTN   v7.4h,  v7.4s
         SQXTN   v16.4h, v16.4s
         SQXTN   v17.4h, v17.4s
         SQXTN   v18.4h, v18.4s
         SQXTN   v19.4h, v19.4s
-        LD1R    {v2.8h}, [x8], 2        // add bias
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v2.8h}, [x8], 2       // add bias
 
-        SQXTN2  v4.8h,  v8.4s
-        SQXTN2  v5.8h,  v9.4s
-        SQXTN2  v6.8h, v10.4s
-        SQXTN2  v7.8h, v11.4s
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
         SQXTN2  v18.8h, v22.4s
         SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
 
-        SQADD   v4.8h,  v4.8h, v2.8h
-        SQADD   v5.8h,  v5.8h, v2.8h
-        SQADD   v6.8h,  v6.8h, v2.8h
-        SQADD   v7.8h,  v7.8h, v2.8h
         SQADD   v16.8h, v16.8h, v2.8h
         SQADD   v17.8h, v17.8h, v2.8h
         SQADD   v18.8h, v18.8h, v2.8h
         SQADD   v19.8h, v19.8h, v2.8h
-        LD1R    {v0.16b}, [x8], 1       // clamp min value
+        SQADD   v24.8h, v24.8h, v2.8h
+        SQADD   v25.8h, v25.8h, v2.8h
+        SQADD   v26.8h, v26.8h, v2.8h
+        SQADD   v27.8h, v27.8h, v2.8h
+        LD1R    {v0.16b}, [x8], 1      // clamp min value
 
-        SQXTN   v4.8b,  v4.8h
-        SQXTN   v5.8b,  v5.8h
-        SQXTN   v6.8b,  v6.8h
-        SQXTN   v7.8b,  v7.8h
-        LD1R    {v1.16b}, [x8]          // clamp max value
-        SQXTN2  v4.16b, v16.8h
-        SQXTN2  v5.16b, v17.8h
-        SQXTN2  v6.16b, v18.8h
-        SQXTN2  v7.16b, v19.8h
+        SQXTN   v4.8b, v16.8h
+        SQXTN   v5.8b, v17.8h
+        SQXTN   v6.8b, v18.8h
+        SQXTN   v7.8b, v19.8h
+        LD1R    {v1.16b}, [x8]         // clamp max value
+        SQXTN2  v4.16b, v24.8h
+        SQXTN2  v5.16b, v25.8h
+        SQXTN2  v6.16b, v26.8h
+        SQXTN2  v7.16b, v27.8h
         SUB     x8, x8, 11              // rewind params pointer
-
-        SMAX    v4.16b,  v4.16b, v0.16b
-        SMAX    v5.16b,  v5.16b, v0.16b
-        LDR     x0, [sp, 32]            // Load cn_stride
-        SMAX    v6.16b,  v6.16b, v0.16b
-        SMAX    v7.16b,  v7.16b, v0.16b
+        SMAX    v4.16b, v4.16b, v0.16b
+        SMAX    v5.16b, v5.16b, v0.16b
+        LDR     x0, [sp, 32]           // cn_stride
+        SMAX    v6.16b, v6.16b, v0.16b
+        SMAX    v7.16b, v7.16b, v0.16b
         SUBS    x1, x1, 16
-        SMIN    v4.16b,  v4.16b, v1.16b
-        SMIN    v5.16b,  v5.16b, v1.16b
-        SMIN    v6.16b,  v6.16b, v1.16b
-        SMIN    v7.16b,  v7.16b, v1.16b
+        SMIN    v4.16b, v4.16b, v1.16b
+        SMIN    v5.16b, v5.16b, v1.16b
+        SMIN    v6.16b, v6.16b, v1.16b
+        SMIN    v7.16b, v7.16b, v1.16b
         B.LO    7f
 
         # Store full 4 x 16
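
The reshuffled gemmlowp epilogue above computes the same math as before with
fewer live registers: SQRDMULH is a saturating rounding doubling high
multiply, the BIC/SSRA pair nudges negative accumulators by -1 whenever a
right shift follows, and SRSHL with a negative shift amount is a rounding
arithmetic right shift. A per-lane scalar model, with illustrative names for
the two values LD2R loads into v0 and v1 (v1 holds the negative left-shift
amount); the SQRDMULH saturation corner case at INT32_MIN * INT32_MIN is
omitted for brevity:

    #include <stdint.h>

    static inline int32_t gemmlowp_requantize(int32_t acc, int32_t multiplier,
                                              uint32_t shift) {
      /* SQRDMULH: rounding doubling high half of acc * multiplier. */
      const int64_t prod = (int64_t) acc * (int64_t) multiplier;
      int32_t hi = (int32_t) ((prod + (INT64_C(1) << 30)) >> 31);
      /* BIC + SSRA #31: bias negative accumulators by -1, but only when a
         right shift follows (the CMEQ mask zeroes the term for shift == 0). */
      if (shift != 0) {
        hi += acc >> 31;
      }
      /* SRSHL by -shift: rounding arithmetic right shift. */
      if (shift == 0) return hi;
      return (int32_t) (((int64_t) hi + (INT64_C(1) << (shift - 1))) >> shift);
    }
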
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
index d41ebba..99d43ee 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
+++ b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
@@ -9,6 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
+
 # void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
@@ -21,7 +22,7 @@
 #     size_t cn_stride,                  [sp] -> x10
 #     size_t a_offset,                   [sp + 8] -> x11
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+#     const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -276,6 +277,7 @@
         SRSHL   v30.4s, v30.4s, v1.4s
         SRSHL   v31.4s, v31.4s, v1.4s
 
+
         SQXTN   v16.4h, v16.4s
         SQXTN   v17.4h, v17.4s
         SQXTN   v18.4h, v18.4s
@@ -284,7 +286,7 @@
         SQXTN   v25.4h, v25.4s
         SQXTN   v26.4h, v26.4s
         SQXTN   v27.4h, v27.4s
-        LD1R    {v2.8h}, [x8], 2        // add bias
+        LD1R    {v2.8h}, [x8], 2       // add bias
 
         SQXTN2  v16.8h, v20.4s
         SQXTN2  v17.8h, v21.4s
@@ -303,13 +305,13 @@
         SQADD   v25.8h, v25.8h, v2.8h
         SQADD   v26.8h, v26.8h, v2.8h
         SQADD   v27.8h, v27.8h, v2.8h
-        LD1R    {v0.16b}, [x8], 1       // clamp min value
+        LD1R    {v0.16b}, [x8], 1      // clamp min value
 
         SQXTN   v4.8b, v16.8h
         SQXTN   v5.8b, v17.8h
         SQXTN   v6.8b, v18.8h
         SQXTN   v7.8b, v19.8h
-        LD1R    {v1.16b}, [x8]          // clamp max value
+        LD1R    {v1.16b}, [x8]         // clamp max value
         SQXTN2  v4.16b, v24.8h
         SQXTN2  v5.16b, v25.8h
         SQXTN2  v6.16b, v26.8h
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
index 48b995e..a188893 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+++ b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -9,6 +9,7 @@
 
 #include <xnnpack/assembly.h>
 
+
 # void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64(
 #     size_t mr,                 x0
 #     size_t nc,                 x1
@@ -21,7 +22,7 @@
 #     size_t cn_stride,                  [sp] -> x10
 #     size_t a_offset,                   [sp + 8] -> x11
 #     const float* zero,                 [sp + 16] -> x12
-#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+#     const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index d13693b..aa9062c 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -648,8 +648,6 @@
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32)
@@ -657,7 +655,12 @@
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55)
 
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55)
 
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 4d17ea9..6cc5ceb 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -462,6 +462,10 @@
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55)
 
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55)
+
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld64)
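
All of the c4 kernels declared above are built around the same SDOT
by-element form: SDOT vD.4s, vW.16b, vA.4b[i] adds, into each 32-bit lane of
vD, the dot product of four int8 weights with the four int8 activation bytes
selected by lane i of vA. A scalar model, with illustrative names:

    #include <stdint.h>

    /* Model of SDOT vD.4s, vW.16b, vA.4b[lane]: acc[i] accumulates the
       dot product of weight group i with the broadcast activation group. */
    static void sdot_lane(int32_t acc[4], const int8_t w[16],
                          const int8_t a[16], int lane) {
      for (int i = 0; i < 4; i++) {
        int32_t sum = 0;
        for (int j = 0; j < 4; j++) {
          sum += (int32_t) w[4 * i + j] * (int32_t) a[4 * lane + j];
        }
        acc[i] += sum;
      }
    }
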
diff --git a/test/qs8-gemm-minmax-fp32.cc b/test/qs8-gemm-minmax-fp32.cc
index 553a0e0..36a154b 100644
--- a/test/qs8-gemm-minmax-fp32.cc
+++ b/test/qs8-gemm-minmax-fp32.cc
@@ -23,6 +23,1830 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .a_stride(7)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(4)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(7)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(43)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 20; k += 5) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
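+
+// The odd strides used throughout these tests (a_stride/cn_stride/cm_stride of
+// 11, 19, 23, 43, ...) are presumably primes chosen to exceed the dense k or n
+// extent, so a kernel that advances its pointers by the tile size instead of
+// the caller-supplied stride produces a mismatch.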
+
+
+#if XNN_ARCH_ARM64
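+  // The _ld64 suffix denotes a variant that loads 64 bits (8 int8 values) of
+  // the A operand per inner-loop step, versus 32 bits for _ld32 above, so the
+  // sweeps below are centered on a k block of 8 rather than 4.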
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
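+  // In the 4x16 tests, mr(4)/nr(16) describe the microkernel's register tile,
+  // while m/n/k give the logical problem size. n may exceed nr because the
+  // kernel iterates over 16-wide column blocks of the packed weights within a
+  // single call, which is what the n_gt_16 and n_div_16 tests exercise.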
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .a_stride(7)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(4)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(7)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(43)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 20; k += 5) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
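+
+// The subtile sweeps above run with iterations(1), validating each (m, n)
+// corner once on fresh random data to keep the full mr x nr sweep cheap; the
+// single-shape tests rely on the tester's default of multiple randomized
+// iterations.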
+
+
+#if XNN_ARCH_ARM64
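+  // A minimal sketch of the call GemmMicrokernelTester ultimately issues for
+  // the k_eq_8 case below, assuming the standard QS8 GEMM ukernel signature
+  // (kc and all strides in bytes; w points at bias + weights packed for
+  // 16-wide tiles):
+  //
+  //   xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
+  //       /*mr=*/4, /*nc=*/16, /*kc=*/8 * sizeof(int8_t),
+  //       a, /*a_stride=*/8 * sizeof(int8_t), packed_w,
+  //       c, /*cm_stride=*/16 * sizeof(int8_t), /*cn_stride=*/16 * sizeof(int8_t),
+  //       &params);  // filled by xnn_init_qs8_conv_minmax_fp32_neonv8_params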
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
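+
+// qmin(128)/qmax(128) clamp the output from one side at the midpoint of the
+// quantized range (the tester presumably shifts its [0, 255] limits down by
+// 128 for qs8), verifying that the kernel applies the fp32 requantization
+// min/max rather than only saturating at the int8 limits.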
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
@@ -478,6 +2302,462 @@
 #endif  // XNN_ARCH_ARM64
 
 
+#if XNN_ARCH_ARM64
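+  // The cortex_a55 variant keeps the same 4x16c4 SDOT computation but with an
+  // instruction schedule tuned for the in-order Cortex-A55 pipeline; it
+  // consumes k in blocks of 16, hence the k_eq_16-centered sweeps below.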
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
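+// Editorial note: the *_fp32 suites above check the kernel against the float
+// requantization reference passed to Test(). As a rough sketch (not the exact
+// library source; assumes <cmath> and <algorithm>), xnn_qs8_requantize_fp32
+// behaves like:
+//
+//   int8_t requantize_fp32(int32_t acc, float scale,
+//                          int8_t zero_point, int8_t qmin, int8_t qmax) {
+//     const int32_t q = (int32_t) lrintf((float) acc * scale);  // round to nearest-even
+//     const int32_t y = q + (int32_t) zero_point;               // re-center on output zero point
+//     return (int8_t) std::min(std::max(y, (int32_t) qmin), (int32_t) qmax);  // clamp
+//   }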
+
+
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
diff --git a/test/qs8-gemm-minmax-fp32.yaml b/test/qs8-gemm-minmax-fp32.yaml
index d489b28..37cbbb0 100644
--- a/test/qs8-gemm-minmax-fp32.yaml
+++ b/test/qs8-gemm-minmax-fp32.yaml
@@ -3,9 +3,24 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 4
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 4
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 8
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 16
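+# Note: k-block is the K unrolling of the kernel's main loop; the generated
+# tests derive their k_eq_*/k_lt_*/k_gt_*/k_div_* boundaries from it, so the
+# ld32 kernels (k-block 4) get k_eq_4 cases, ld64 (k-block 8) k_eq_8, and
+# ld128/cortex-a55 (k-block 16) k_eq_16.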
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 8
diff --git a/test/qs8-gemm-minmax-gemmlowp.cc b/test/qs8-gemm-minmax-gemmlowp.cc
index e724e64..bf6466b 100644
--- a/test/qs8-gemm-minmax-gemmlowp.cc
+++ b/test/qs8-gemm-minmax-gemmlowp.cc
@@ -23,6 +23,2742 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .a_stride(7)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(4)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(7)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(43)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 20; k += 5) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(4)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
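+// Editorial note: in these generated suites, mr()/nr()/kr()/sr() declare the
+// microkernel's tile (here a 1x16 tile consuming kr = 4 K-channels per step),
+// while m()/n()/k() are the problem sizes actually run. k values that are not
+// multiples of kr exercise the packed-weight remainder path, the prime
+// strides (a_stride(7), cn_stride(19), cm_stride(19), ...) sit just above the
+// minimum legal stride to flush out pointer-arithmetic bugs, and the
+// qmin(128)/qmax(128) cases check that the min/max activation clamp is
+// actually applied.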
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
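+// Editorial note: the ld32/ld64/ld128 suffixes name the width of each A load
+// in the kernel's main loop (32, 64 or 128 bits per row), which is why their
+// K-blocks are 4, 8 and 16 respectively. Each SDOT instruction then dots one
+// 4-byte group of K-channels against the packed weights, accumulating into
+// four int32 lanes.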
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .a_stride(7)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(4)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(4)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(7)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 4; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 5; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(43)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 8; k <= 40; k += 4) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(23)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 20; k += 5) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 20; k += 5) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(4)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
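+// Editorial sketch: outside the tester, these microkernels take the standard
+// XNNPACK GEMM ukernel arguments; roughly (see src/xnnpack/gemm.h for the
+// authoritative declaration):
+//
+//   xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32(
+//       /*mr=*/4, /*nc=*/16, /*kc=*/k * sizeof(int8_t),
+//       a, a_stride, packed_w,
+//       c, cm_stride, cn_stride, &params);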
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
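+// The ld32/ld64/ld128 suffixes indicate how many bits of each row of A the
+// kernel's main loop loads per iteration: 32, 64, or 128 bits, i.e. 4, 8, or
+// 16 int8 elements. Hence the baseline k in each suite (k_eq_4, k_eq_8, and
+// k_eq_16 below) matches that block size, with the k_lt/k_gt/k_div tests
+// covering the remainder paths under, over, and at multiples of it.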
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
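+// The cortex-a55 variant computes the same 4x16c4 GEMM with instruction
+// scheduling hand-tuned for the in-order Cortex-A55 pipeline; like ld128 it
+// consumes 16 k-elements per main-loop iteration, so it shares the k_eq_16
+// test coverage.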
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(163)
+        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(83)
+          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
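+// Each Test() call above compares the microkernel against a scalar reference
+// along these lines (a minimal sketch; names are illustrative, and the actual
+// accumulation and gemmlowp requantization live in GemmMicrokernelTester):
+//
+//   for (size_t i = 0; i < m; i++) {
+//     for (size_t j = 0; j < n; j++) {
+//       int32_t acc = bias[j];
+//       for (size_t p = 0; p < k; p++) {
+//         acc += int32_t(a[i * a_stride + p]) * int32_t(b[j * k + p]);
+//       }
+//       c[i * cm_stride + j] = clamp(requantize_gemmlowp(acc), qmin, qmax);
+//     }
+//   }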
+
+#if XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -40150,2742 +42886,6 @@
 #endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(4)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(4)
-      .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(4)
-      .a_stride(7)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(4)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(4)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(4)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 4; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 4; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .a_stride(7)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 4; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 5; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 5; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 5; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 8; k <= 40; k += 4) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 8; k <= 40; k += 4) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .a_stride(43)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 8; k <= 40; k += 4) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(23)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(23)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 20; k += 5) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(4)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(4)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(4)
-      .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(4)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(4)
-      .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(4)
-      .a_stride(7)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(4)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(4)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(4)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 4; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 4; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(7)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 4; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 5; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 5; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 5; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 8; k <= 40; k += 4) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 8; k <= 40; k += 4) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(43)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 8; k <= 40; k += 4) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(23)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(23)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 20; k += 5) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 20; k += 5) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(4)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(4)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(4)
-      .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .a_stride(11)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(11)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(83)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(43)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .a_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(19)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(37)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .a_stride(163)
-        .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_a) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .a_stride(83)
-          .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X4C2__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
diff --git a/test/qs8-gemm-minmax-gemmlowp.yaml b/test/qs8-gemm-minmax-gemmlowp.yaml
index a700b01..878e34a 100644
--- a/test/qs8-gemm-minmax-gemmlowp.yaml
+++ b/test/qs8-gemm-minmax-gemmlowp.yaml
@@ -3,6 +3,24 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 4
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 4
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 16
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 16
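+# k-block records each microkernel's K unrolling; the generated test cases
+# (k_eq_*, k_lt_*, k_gt_*, k_div_*) take their boundaries from it, e.g.
+# k == 16 for the ld128 and cortex-a55 kernels above.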
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
@@ -267,24 +285,6 @@
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 4
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 4
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
 - name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64
   init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
   k-block: 8
diff --git a/test/qs8-igemm-minmax-fp32.cc b/test/qs8-igemm-minmax-fp32.cc
index 8f6b855..4db5f4e 100644
--- a/test/qs8-igemm-minmax-fp32.cc
+++ b/test/qs8-igemm-minmax-fp32.cc
@@ -22,6 +22,1410 @@
 #include "gemm-microkernel-tester.h"
 
 
+#if XNN_ARCH_ARM64
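+  // Tests for the QS8 IGEMM 4x16c4 microkernel with FP32 requantization,
+  // tuned for Cortex-A55. All tests require the NEON dot product (SDOT)
+  // extension and are skipped at runtime when it is unavailable.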
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          // NOTE: sibling subtile tests pass the loop variable as .n(n);
+          // here .n(16) leaves the n loop unused.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
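+  // The small_kernel tests run with ks(3), i.e. three indirection pointers
+  // per output row, exercising the IGEMM accumulation across kernel taps
+  // instead of a single pass over A.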
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
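+  // a_offset applies a nonzero byte offset to the indirect input pointers,
+  // exercising the runtime pointer adjustment performed by the IGEMM
+  // microkernel.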
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, a_offset) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
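+  // zero replaces input row mz with the shared zero buffer, which the
+  // microkernel must use as-is (no a_offset adjustment) so that padding
+  // rows read as zeros.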
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, zero) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
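+  // LD64 variant of the same microkernel: 64-bit A loads with a k-block of
+  // 8, so the baseline tests use k == 8.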
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, a_offset) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, zero) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
diff --git a/test/qs8-igemm-minmax-fp32.yaml b/test/qs8-igemm-minmax-fp32.yaml
index 24f7599..7861808 100644
--- a/test/qs8-igemm-minmax-fp32.yaml
+++ b/test/qs8-igemm-minmax-fp32.yaml
@@ -3,6 +3,15 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 16
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 8
diff --git a/test/qs8-igemm-minmax-gemmlowp.cc b/test/qs8-igemm-minmax-gemmlowp.cc
index 590d5e9..fb8862e 100644
--- a/test/qs8-igemm-minmax-gemmlowp.cc
+++ b/test/qs8-igemm-minmax-gemmlowp.cc
@@ -491,6 +491,942 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, a_offset) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, zero) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -41206,942 +42142,6 @@
 #endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, a_offset) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, zero) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, a_offset) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, zero) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
-  }
-#endif  // XNN_ARCH_ARM64
-
-
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X4C2__SSE2_LD64, k_eq_8) {
     TEST_REQUIRES_X86_SSE2;
diff --git a/test/qs8-igemm-minmax-gemmlowp.yaml b/test/qs8-igemm-minmax-gemmlowp.yaml
index 86c7eb5..348ae0e 100644
--- a/test/qs8-igemm-minmax-gemmlowp.yaml
+++ b/test/qs8-igemm-minmax-gemmlowp.yaml
@@ -6,6 +6,12 @@
 - name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 16
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
+  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+  k-block: 16
 - name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
@@ -267,12 +273,6 @@
 - name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_8x16c4__neondot
   init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
-  init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
-  k-block: 16
 - name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64
   init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
   k-block: 8