Re-label branch targets in c4-neondot assembly QS8 GEMM microkernels.
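Sort the QS8 GEMM/IGEMM source filenames in BUILD.bazel and CMakeLists.txt numerically.
Add the missing trailing newline at the end of the 1x16c4 neondot assembly files.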

PiperOrigin-RevId: 360476768
diff --git a/BUILD.bazel b/BUILD.bazel
index b20035e..56a359f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1699,97 +1699,97 @@
     "src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c",
     "src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c",
     "src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/1x16-minmax-neon-mlal-lane.c",
     "src/qs8-gemm/gen/1x16-minmax-neon-mull-addw-dup.c",
     "src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c",
     "src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c",
     "src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/2x8-minmax-neon-mlal-lane.c",
     "src/qs8-gemm/gen/2x8-minmax-neon-mull-addw-dup.c",
     "src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c",
     "src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c",
     "src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/2x16-minmax-neon-mlal-lane.c",
     "src/qs8-gemm/gen/2x16-minmax-neon-mull-addw-dup.c",
     "src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c",
     "src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c",
     "src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/3x8-minmax-neon-mlal-lane.c",
     "src/qs8-gemm/gen/3x8-minmax-neon-mull-addw-dup.c",
     "src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c",
     "src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c",
     "src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/3x16-minmax-neon-mlal-lane.c",
     "src/qs8-gemm/gen/3x16-minmax-neon-mull-addw-dup.c",
     "src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c",
     "src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c",
     "src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/4x8-minmax-neon-mlal-lane.c",
     "src/qs8-gemm/gen/4x8-minmax-neon-mull-addw-dup.c",
     "src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c",
     "src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c",
     "src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/4x16-minmax-neon-mlal-lane.c",
     "src/qs8-gemm/gen/4x16-minmax-neon-mull-addw-dup.c",
     "src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c",
     "src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c",
     "src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c",
-    "src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c",
-    "src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c",
-    "src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c",
-    "src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c",
-    "src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c",
-    "src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c",
-    "src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c",
     "src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c",
     "src/qs8-igemm/gen/1x8-minmax-neon-mlal-lane.c",
-    "src/qs8-igemm/gen/1x16-minmax-neon-mlal-lane.c",
-    "src/qs8-igemm/gen/2x8-minmax-neon-mlal-lane.c",
-    "src/qs8-igemm/gen/2x16-minmax-neon-mlal-lane.c",
-    "src/qs8-igemm/gen/3x8-minmax-neon-mlal-lane.c",
-    "src/qs8-igemm/gen/3x16-minmax-neon-mlal-lane.c",
-    "src/qs8-igemm/gen/4x8-minmax-neon-mlal-lane.c",
-    "src/qs8-igemm/gen/4x16-minmax-neon-mlal-lane.c",
-    "src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c",
-    "src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c",
-    "src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c",
-    "src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c",
-    "src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c",
-    "src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c",
-    "src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c",
-    "src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c",
-    "src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c",
-    "src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c",
-    "src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c",
-    "src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c",
-    "src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c",
-    "src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c",
-    "src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c",
-    "src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c",
-    "src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c",
-    "src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c",
-    "src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c",
-    "src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c",
-    "src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c",
-    "src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c",
-    "src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c",
-    "src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c",
-    "src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c",
-    "src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c",
-    "src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c",
-    "src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c",
-    "src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c",
-    "src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c",
-    "src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c",
-    "src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c",
-    "src/qs8-igemm/gen/1x16-minmax-neon-mull-addw-dup.c",
     "src/qs8-igemm/gen/1x8-minmax-neon-mull-addw-dup.c",
-    "src/qs8-igemm/gen/2x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/1x16-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/1x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/2x8-minmax-neon-mlal-lane.c",
     "src/qs8-igemm/gen/2x8-minmax-neon-mull-addw-dup.c",
-    "src/qs8-igemm/gen/3x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/2x16-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/2x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/3x8-minmax-neon-mlal-lane.c",
     "src/qs8-igemm/gen/3x8-minmax-neon-mull-addw-dup.c",
-    "src/qs8-igemm/gen/4x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/3x16-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/3x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/4x8-minmax-neon-mlal-lane.c",
     "src/qs8-igemm/gen/4x8-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c",
+    "src/qs8-igemm/gen/4x16-minmax-neon-mlal-lane.c",
+    "src/qs8-igemm/gen/4x16-minmax-neon-mull-addw-dup.c",
+    "src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c",
+    "src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c",
+    "src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c",
+    "src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c",
     "src/qs8-requantization/fp32-neon.c",
     "src/qs8-requantization/precise-neon.c",
     "src/qs8-requantization/q31-neon.c",
@@ -3551,8 +3551,8 @@
     "src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S",
     "src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S",
     "src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S",
-    "src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S",
     "src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S",
+    "src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S",
 ]
 
 INTERNAL_MICROKERNEL_HDRS = [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb1252c..1d0f2b3 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -948,97 +948,97 @@
   src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
   src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/1x16-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/1x16-minmax-neon-mull-addw-dup.c
   src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
   src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/2x8-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/2x8-minmax-neon-mull-addw-dup.c
   src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
   src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/2x16-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/2x16-minmax-neon-mull-addw-dup.c
   src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
   src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/3x8-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/3x8-minmax-neon-mull-addw-dup.c
   src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
   src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/3x16-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/3x16-minmax-neon-mull-addw-dup.c
   src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
   src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/4x8-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/4x8-minmax-neon-mull-addw-dup.c
   src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
   src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c
+  src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/4x16-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/4x16-minmax-neon-mull-addw-dup.c
   src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
   src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c
-  src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c
-  src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c
-  src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c
-  src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c
-  src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c
-  src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c
-  src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c
   src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c
   src/qs8-igemm/gen/1x8-minmax-neon-mlal-lane.c
-  src/qs8-igemm/gen/1x16-minmax-neon-mlal-lane.c
-  src/qs8-igemm/gen/2x8-minmax-neon-mlal-lane.c
-  src/qs8-igemm/gen/2x16-minmax-neon-mlal-lane.c
-  src/qs8-igemm/gen/3x8-minmax-neon-mlal-lane.c
-  src/qs8-igemm/gen/3x16-minmax-neon-mlal-lane.c
-  src/qs8-igemm/gen/4x8-minmax-neon-mlal-lane.c
-  src/qs8-igemm/gen/4x16-minmax-neon-mlal-lane.c
-  src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c
-  src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c
-  src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c
-  src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c
-  src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c
-  src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c
-  src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
-  src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c
-  src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c
-  src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c
-  src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c
-  src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c
-  src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c
-  src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c
-  src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c
-  src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c
-  src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
-  src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
-  src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
-  src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
-  src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
-  src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
-  src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
-  src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
-  src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
-  src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
-  src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
-  src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
-  src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
-  src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
-  src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
-  src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
-  src/qs8-igemm/gen/1x16-minmax-neon-mull-addw-dup.c
   src/qs8-igemm/gen/1x8-minmax-neon-mull-addw-dup.c
-  src/qs8-igemm/gen/2x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/1x16-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/1x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/2x8-minmax-neon-mlal-lane.c
   src/qs8-igemm/gen/2x8-minmax-neon-mull-addw-dup.c
-  src/qs8-igemm/gen/3x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/2x16-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/2x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/3x8-minmax-neon-mlal-lane.c
   src/qs8-igemm/gen/3x8-minmax-neon-mull-addw-dup.c
-  src/qs8-igemm/gen/4x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/3x16-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/3x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/4x8-minmax-neon-mlal-lane.c
   src/qs8-igemm/gen/4x8-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c
+  src/qs8-igemm/gen/4x16-minmax-neon-mlal-lane.c
+  src/qs8-igemm/gen/4x16-minmax-neon-mull-addw-dup.c
+  src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
+  src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
+  src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c
   src/qs8-requantization/fp32-neon.c
   src/qs8-requantization/precise-neon.c
   src/qs8-requantization/q31-neon.c
diff --git a/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S b/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S
index e369354..4042e75 100644
--- a/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S
+++ b/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S
@@ -83,7 +83,7 @@
         SUBS    x1, x1, 16
         SMAX    v4.16b, v4.16b, v0.16b
         SMIN    v4.16b, v4.16b, v1.16b
-        B.LO    4f
+        B.LO    2f
 
         # Store full 1 x 16
         ST1     {v4.16b}, [x6], x12
@@ -93,26 +93,26 @@
 
         # Store odd width
         .p2align 3
-4:
-        TBZ     x1, 3, 5f
+2:
+        TBZ     x1, 3, 3f
         STR     d4, [x6], 8
         DUP     d4, v4.d[1]
-5:
-        TBZ     x1, 2, 6f
+3:
+        TBZ     x1, 2, 4f
         STR     s4, [x6], 4
         DUP     s4, v4.s[1]
-6:
-        TBZ     x1, 1, 7f
+4:
+        TBZ     x1, 1, 5f
         ST1     {v4.h}[0], [x6], 2
         DUP     h4, v4.h[1]
-7:
-        TBZ     x1, 0, 8f
+5:
+        TBZ     x1, 0, 6f
         ST1     {v4.b}[0], [x6]
-8:
+6:
         RET
 
 END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld32
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
-#endif
\ No newline at end of file
+#endif
diff --git a/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S b/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S
index 914b4e7..15ab63b 100644
--- a/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S
+++ b/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S
@@ -147,4 +147,4 @@
 
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
-#endif
\ No newline at end of file
+#endif
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S b/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S
index b573ae7..3d21031 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S
@@ -235,7 +235,7 @@
         SMIN    v5.16b, v5.16b, v1.16b
         SMIN    v6.16b, v6.16b, v1.16b
         SMIN    v7.16b, v7.16b, v1.16b
-        B.LO    4f
+        B.LO    2f
 
         # Store full 4 x 16
         ST1     {v4.16b}, [x6], x12
@@ -251,8 +251,8 @@
 
         # Store odd width
         .p2align 3
-4:
-        TBZ     x1, 3, 5f
+2:
+        TBZ     x1, 3, 3f
         STR     d4, [x6], 8
         DUP     d4, v4.d[1]
         STR     d5, [x8], 8
@@ -261,8 +261,8 @@
         DUP     d6, v6.d[1]
         STR     d7, [x7], 8
         DUP     d7, v7.d[1]
-5:
-        TBZ     x1, 2, 6f
+3:
+        TBZ     x1, 2, 4f
         STR     s4, [x6], 4
         DUP     s4, v4.s[1]
         STR     s5, [x8], 4
@@ -271,8 +271,8 @@
         DUP     s6, v6.s[1]
         STR     s7, [x7], 4
         DUP     s7, v7.s[1]
-6:
-        TBZ     x1, 1, 7f
+4:
+        TBZ     x1, 1, 5f
         ST1     {v4.h}[0], [x6], 2
         DUP     h4, v4.h[1]
         ST1     {v5.h}[0], [x8], 2
@@ -281,13 +281,13 @@
         DUP     h6, v6.h[1]
         ST1     {v7.h}[0], [x7], 2
         DUP     h7, v7.h[1]
-7:
-        TBZ     x1, 0, 8f
+5:
+        TBZ     x1, 0, 6f
         ST1     {v4.b}[0], [x6]
         ST1     {v5.b}[0], [x8]
         ST1     {v6.b}[0], [x9]
         ST1     {v7.b}[0], [x7]
-8:
+6:
         RET
 
 END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld32